bitkeeper revision 1.1159.72.2 (413cb4b0nYQ7KFQbxIn6g-4lsRAgbQ)
author     cl349@labyrinth.cl.cam.ac.uk <cl349@labyrinth.cl.cam.ac.uk>
           Mon, 6 Sep 2004 19:04:16 +0000 (19:04 +0000)
committer  cl349@labyrinth.cl.cam.ac.uk <cl349@labyrinth.cl.cam.ac.uk>
           Mon, 6 Sep 2004 19:04:16 +0000 (19:04 +0000)
Add sparse tree for NetBSD.
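
(For context, a sketch of how a sparse tree like this is typically used: the
helper scripts already listed in .rootkeys -- mkbuildtree, nbconfig-xen and
nbmake-xen -- suggest that the Xen-specific files are overlaid onto a stock
NetBSD 2.0 source tree, after which the XEN kernel configuration below is
built with the cross-build tools. The invocation here is an assumption
inferred from the script names, not part of this commit; the ../netbsd-2.0
path is hypothetical.)

    # hypothetical usage -- script names from .rootkeys, paths and
    # arguments are assumptions
    cd netbsd-2.0-xen-sparse
    ./mkbuildtree ../netbsd-2.0    # overlay sparse files on a stock tree
    ./nbconfig-xen XEN             # run config(8) on the XEN configuration
    ./nbmake-xen                   # build the Xen-enabled kernel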

28 files changed:
.rootkeys
netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c [new file with mode: 0644]
netbsd-2.0-xen-sparse/sys/nfs/files.nfs [new file with mode: 0644]

index 98d2a0e4e8a753e1f33ce4647e7000c5aabbbc23..60f058749089c29de3cadc4985a649316cb1553a 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 413cb1e5c_Mkxf_X0zimEhTKI_l4DA netbsd-2.0-xen-sparse/mkbuildtree
 413cb1e5kY_Zil7-b0kI6hvCIxBEYg netbsd-2.0-xen-sparse/nbconfig-xen
 413cb1e5-58q5doPifcE1Q8ZAgm-JQ netbsd-2.0-xen-sparse/nbmake-xen
+413cb3b3Cmp02Gj87f3wwu2W9y0gBg netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
+413cb3b3aUP9GmUWqHWQ2SRp1qXnqQ netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
+413cb3b3pZuLKElEpQwX1C-3hLW4qA netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
+413cb3b34ui1cCGaSqIeLiBgMp-PDw netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
+413cb3b3i11i2GVGn0YGlRbM3ifbPQ netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
+413cb3b3FgMboWw-Pm3XdbBFSlZl_g netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
+413cb3b4ABCSfkHRmbsWfnZNG28nBA netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
+413cb3b4bvVJ7UlliMSH60J4uIb9kA netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
+413cb3b4aKd9SUY-OzUiTF0Gb9ve9w netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
+413cb3b4jUtWl-sP493PvB27o-Iltw netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
+413cb3b4ElwwoJEmmzflV0HgK5Qxcg netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
+413cb3b4k9OVRCxuSdhKt-2baTp_Yg netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
+413cb3b4bRsqiHQLTKEZk4-zOksf8A netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
+413cb3b4OqY83qI8GztIZGADpvrpSw netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
+413cb3b42GG0LffraTnpZKlSUq57wg netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
+413cb3b4F0ArkWVBRyspkw7ivfXihg netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
+413cb3b4ullQud70n4JClwoEEUBh8Q netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
+413cb3b4y1Ffq8BOhbdSpn-fGmKuEg netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
+413cb3b4uXOFcT56QuLt1fcDrB-4Zg netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
+413cb3b4hIffjrKn3zhVqJmH6ueB3Q netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
+413cb3b4eNdRIasCoQIuX4Nu39Dlqw netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
+413cb3b40DLJLbX_ZUIULB0JFjBuaw netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
+413cb3b46JnvK1UurZAubeQoFg1W-w netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
+413cb3b5rIKB3TbyhK3pbNyVkYysqA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
+413cb3b5eKxnzoodEqaWn2wrPnHWnA netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
+413cb3b5F56TvQWAmO5TsuzhtzLFPQ netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
+413cb3b53nyOv1OIeDSsCXhBFDXvJA netbsd-2.0-xen-sparse/sys/nfs/files.nfs
 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
 4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/XEN
new file mode 100644 (file)
index 0000000..2fbb999
--- /dev/null
@@ -0,0 +1,176 @@
+# $NetBSD: XEN,v 1.1.2.2 2004/07/15 20:19:34 he Exp $
+
+include        "arch/xen/conf/std.xen"
+
+options        INCLUDE_CONFIG_FILE     # embed config file in kernel binary
+
+#options               UVMHIST
+#options               UVMHIST_PRINT
+#options               SYSCALL_DEBUG
+
+maxusers       32              # estimated number of users
+
+#
+options                XEN
+#options               DOM0OPS
+options                HZ=50
+
+#options       I586_CPU
+options        I686_CPU
+
+#options       VM86            # virtual 8086 emulation
+#options       USER_LDT        # user-settable LDT; used by WINE
+
+#options       MTRR            # memory-type range register syscall support
+
+#options       CONSDEVNAME="\"xencons\""
+#options       CONS_OVERRIDE
+
+options                INSECURE        # disable kernel security levels - X needs this
+
+options        RTC_OFFSET=0    # hardware clock is this many mins. west of GMT
+#options       NTP             # NTP phase/frequency locked loop
+
+options        KTRACE          # system call tracing via ktrace(1)
+#options       SYSTRACE        # system call vetting via systrace(1)
+
+options        SYSVMSG         # System V-like message queues
+options        SYSVSEM         # System V-like semaphores
+#options       SEMMNI=10       # number of semaphore identifiers
+#options       SEMMNS=60       # number of semaphores in system
+#options       SEMUME=10       # max number of undo entries per process
+#options       SEMMNU=30       # number of undo structures in system
+options        SYSVSHM         # System V-like memory sharing
+#options       SHMMAXPGS=2048  # 2048 pages is the default
+options        P1003_1B_SEMAPHORE      # p1003.1b semaphore support
+
+options        LKM             # loadable kernel modules
+
+options        USERCONF        # userconf(4) support
+options        SYSCTL_INCLUDE_DESCR    # Include sysctl descriptions in kernel
+
+# Diagnostic/debugging support options
+options        DIAGNOSTIC      # expensive kernel consistency checks
+options        DEBUG           # expensive debugging checks/support 
+options        KMEMSTATS       # kernel memory statistics (vmstat -m)
+options        DDB             # in-kernel debugger
+options                DDB_ONPANIC=1   # see also sysctl(8): `ddb.onpanic'
+options        DDB_HISTORY_SIZE=512    # enable history editing in DDB
+#options       KGDB            # remote debugger
+#options       KGDB_DEVNAME="\"com\"",KGDB_DEVADDR=0x2f8,KGDB_DEVRATE=57600
+makeoptions    DEBUG="-g"      # compile full symbol table
+
+#options       COMPAT_14       # NetBSD 1.4
+#options       COMPAT_15       # NetBSD 1.5
+options        COMPAT_16       # NetBSD 1.6
+
+##options      COMPAT_LINUX    # binary compatibility with Linux
+#options       COMPAT_FREEBSD  # binary compatibility with FreeBSD
+#options       COMPAT_MACH     # binary compatibility with Mach binaries
+#options       COMPAT_DARWIN   # binary compatibility with Darwin binaries
+#options       EXEC_MACHO      # exec MACH-O binaries
+#options       COMPAT_PECOFF   # kernel support to run Win32 apps
+
+file-system    FFS             # UFS
+file-system    EXT2FS          # second extended file system (linux)
+#file-system   LFS             # log-structured file system
+#file-system   MFS             # memory file system
+file-system    NFS             # Network File System client
+#file-system   NTFS            # Windows/NT file system (experimental)
+#file-system   CD9660          # ISO 9660 + Rock Ridge file system
+#file-system   MSDOSFS         # MS-DOS file system
+file-system    FDESC           # /dev/fd
+file-system    KERNFS          # /kern
+file-system    NULLFS          # loopback file system
+#file-system   OVERLAY         # overlay file system
+#file-system   PORTAL          # portal filesystem (still experimental)
+file-system    PROCFS          # /proc
+#file-system   UMAPFS          # NULLFS + uid and gid remapping
+#file-system   UNION           # union file system
+#file-system   SMBFS           # experimental - CIFS; also needs nsmb (below)
+
+#options       QUOTA           # UFS quotas
+#options       SOFTDEP         # FFS soft updates support.
+#options       NFSSERVER       # Network File System server
+
+options        GATEWAY         # packet forwarding
+options        INET            # IP + ICMP + TCP + UDP
+options        INET6           # IPV6
+options        IPSEC           # IP security
+options        IPSEC_ESP       # IP security (encryption part; define w/IPSEC)
+options        MROUTING        # IP multicast routing
+options        PFIL_HOOKS      # pfil(9) packet filter hooks
+options        IPFILTER_LOG    # ipmon(8) log support
+
+options        NFS_BOOT_DHCP,NFS_BOOT_BOOTPARAM,NFS_BOOT_BOOTSTATIC
+#options       NFS_BOOTSTATIC_MYIP="\"169.254.1.2\""
+#options       NFS_BOOTSTATIC_GWIP="\"169.254.1.1\""
+#options       NFS_BOOTSTATIC_MASK="\"255.255.255.0\""
+#options       NFS_BOOTSTATIC_SERVADDR="\"169.254.1.1\""
+#options       NFS_BOOTSTATIC_SERVER="\"server:/path/to/root\""
+
+options        WSEMUL_VT100            # VT100 / VT220 emulation
+options        WS_KERNEL_FG=WSCOL_GREEN
+options        WSDISPLAY_COMPAT_PCVT           # emulate some ioctls
+options        WSDISPLAY_COMPAT_SYSCONS        # emulate some ioctls
+options        WSDISPLAY_COMPAT_USL            # VT handling
+options        WSDISPLAY_COMPAT_RAWKBD         # can get raw scancodes
+options        WSDISPLAY_DEFAULTSCREENS=4
+options        PCDISPLAY_SOFTCURSOR
+
+config         netbsd  root on ? type ?
+#config                netbsd  root on wd0a type ffs
+#config                netbsd  root on xennet0 type nfs
+
+mainbus0 at root
+
+cpu* at mainbus?
+
+hypervisor*    at mainbus?             # Xen hypervisor
+
+npx0           at hypervisor?          # x86 math coprocessor
+
+xencons*       at hypervisor?          # Xen virtual console
+xennet*        at hypervisor?          # Xen virtual network interface
+
+#xbd*          at hypervisor?          # Xen virtual block device
+#wd*           at hypervisor?          # Xen vbd (wd identity)
+#sd*           at hypervisor?          # Xen vbd (sd identity)
+#cd*           at hypervisor?          # Xen vbd (cd identity)
+
+#xenkbc*       at hypervisor?          # Xen Keyboard/Mouse Interface
+#pckbd*                at xenkbc?              # Keyboard
+#vga*          at hypervisor?          # Xen VGA display
+#pms*          at xenkbc?              # PS/2 Mouse for wsmouse
+
+#wskbd*                at pckbd? console ?
+#wsdisplay*    at vga? console ?
+#wsmouse*      at pms? mux 0
+
+
+include        "arch/xen/conf/GENERIC.local"
+
+
+pseudo-device  ccd             4       # concatenated/striped disk devices
+#pseudo-device cgd             4       # cryptographic disk devices
+#pseudo-device md              1       # memory disk device (ramdisk)
+#pseudo-device vnd             4       # disk-like interface to files
+
+pseudo-device  bpfilter        8       # Berkeley packet filter
+pseudo-device  ipfilter                # IP filter (firewall) and NAT
+pseudo-device  loop                    # network loopback
+#pseudo-device tun             2       # network tunneling over tty
+#pseudo-device gre             2       # generic L3 over IP tunnel
+#pseudo-device gif             4       # IPv[46] over IPv[46] tunnel (RFC1933)
+#pseudo-device faith           1       # IPv[46] tcp relay translation i/f
+#pseudo-device stf             1       # 6to4 IPv6 over IPv4 encapsulation
+#pseudo-device vlan                    # IEEE 802.1q encapsulation
+#pseudo-device bridge                  # simple inter-network bridging
+
+pseudo-device  pty                     # pseudo-terminals
+pseudo-device  rnd                     # /dev/random and in-kernel generator
+pseudo-device  clockctl                # user control of clock subsystem
+
+pseudo-device  wsmux                   # mouse & keyboard multiplexor
+pseudo-device  wsfont
+pseudo-device  ksyms                   # /dev/ksyms
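
(A note on the configuration above: only the Xen console and network
front-ends attach by default; the virtual block device, keyboard and VGA
entries are commented out, and the root device is wildcarded. To boot from
a virtual disk served by domain 0, one would presumably uncomment the xbd
attachment and pin the root device, along these lines -- a sketch, not part
of this commit, and the "xbd0a" spelling is an assumption:)

    xbd*           at hypervisor?          # Xen virtual block device
    config         netbsd  root on xbd0a type ffs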
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen b/netbsd-2.0-xen-sparse/sys/arch/xen/conf/files.xen
new file mode 100644 (file)
index 0000000..12f6bfa
--- /dev/null
@@ -0,0 +1,232 @@
+#      $NetBSD: files.xen,v 1.3.2.1 2004/05/22 15:59:02 he Exp $
+#      NetBSD: files.x86,v 1.10 2003/10/08 17:30:00 bouyer Exp 
+#      NetBSD: files.i386,v 1.254 2004/03/25 23:32:10 jmc Exp 
+
+maxpartitions 8
+
+maxusers 2 16 128
+
+# Processor type options.
+defflag        opt_cputype.h   I686_CPU
+
+# delay before cpu_reset() for reboot.
+defparam               CPURESET_DELAY
+
+# No unmapped page below kernel stack
+defflag                        NOREDZONE
+
+# Beep on halt
+defflag opt_beep.h             BEEP_ONHALT
+defparam opt_beep.h            BEEP_ONHALT_COUNT
+defparam opt_beep.h            BEEP_ONHALT_PITCH BEEP_ONHALT_PERIOD
+
+file   arch/xen/i386/autoconf.c
+file   arch/i386/i386/db_dbgreg.S      ddb | kstack_check_dr0
+file   arch/i386/i386/db_disasm.c      ddb
+file   arch/i386/i386/db_interface.c   ddb
+file   arch/i386/i386/db_memrw.c       ddb | kgdb
+file   arch/i386/i386/db_trace.c       ddb
+file   kern/subr_disk_mbr.c            disk
+file   arch/xen/i386/gdt.c
+file   arch/xen/i386/hypervisor_machdep.c
+file   arch/i386/i386/in_cksum.S       inet | inet6
+file   arch/i386/i386/ipkdb_glue.c     ipkdb
+file   arch/i386/i386/kgdb_machdep.c   kgdb
+file   arch/xen/i386/machdep.c
+file   arch/xen/i386/identcpu.c
+file   arch/i386/i386/math_emulate.c   math_emulate
+file   arch/i386/i386/mem.c
+file   kern/kern_microtime.c           i586_cpu | i686_cpu
+file   arch/i386/i386/mtrr_k6.c        mtrr
+file   netns/ns_cksum.c                ns
+file   arch/xen/i386/pmap.c
+file   arch/i386/i386/process_machdep.c
+file   arch/i386/i386/procfs_machdep.c procfs
+file   arch/xen/i386/sys_machdep.c
+file   arch/i386/i386/syscall.c
+file   arch/xen/i386/trap.c
+file   arch/i386/i386/vm_machdep.c
+file   arch/xen/i386/xen_machdep.c
+
+file   arch/xen/xen/xen_debug.c
+
+file   arch/xen/xen/clock.c
+file   arch/xen/xen/evtchn.c
+file   arch/xen/xen/ctrl_if.c
+
+file   dev/cons.c
+
+file   arch/i386/i386/mptramp.S                multiprocessor
+file    arch/i386/i386/ipifuncs.c      multiprocessor
+
+file   arch/i386/i386/pmc.c            perfctrs
+
+file   crypto/des/arch/i386/des_enc.S          des
+file   crypto/des/arch/i386/des_cbc.S          des
+
+file   crypto/blowfish/arch/i386/bf_enc.S      blowfish
+file   crypto/blowfish/arch/i386/bf_cbc.S      blowfish & !i386_cpu
+
+#
+# Machine-independent SCSI drivers
+#
+
+#xxx include   "dev/scsipi/files.scsipi"
+
+#
+# Machine-independent ATA drivers
+#
+
+#xxx include   "dev/ata/files.ata"
+
+# Memory Disk for install floppy
+file   dev/md_root.c                   memory_disk_hooks
+
+#
+define  mainbus { [apid = -1] }
+
+file   arch/x86/x86/bus_dma.c
+file   arch/xen/x86/bus_space.c
+file   arch/x86/x86/cacheinfo.c
+file   arch/xen/x86/consinit.c
+file   arch/xen/x86/intr.c
+file   arch/x86/x86/ipi.c              multiprocessor
+file   arch/x86/x86/lock_machdep.c     lockdebug
+file   arch/x86/x86/softintr.c
+
+include        "arch/xen/conf/files.compat"
+
+#
+# System bus types
+#
+
+device mainbus: mainbus
+attach mainbus at root
+file   arch/xen/i386/mainbus.c         mainbus
+
+# Xen hypervisor
+device hypervisor { }
+attach hypervisor at mainbus
+file   arch/xen/xen/hypervisor.c       hypervisor needs-flag
+
+# Numeric Processing Extension; Math Co-processor
+device npx
+file   arch/xen/i386/npx.c             npx needs-flag
+
+attach npx at hypervisor with npx_hv
+file   arch/xen/i386/npx_hv.c          npx_hv
+
+# Xen console support
+device xencons: tty
+attach xencons at hypervisor
+file   arch/xen/xen/xencons.c          xencons needs-flag
+
+include        "dev/wscons/files.wscons"
+include        "dev/wsfont/files.wsfont"
+
+include        "dev/pckbport/files.pckbport"
+
+# CPUS
+
+define cpu { [apid = -1] }
+device cpu
+attach cpu at mainbus
+file   arch/xen/i386/cpu.c             cpu
+
+#
+# Compatibility modules
+#
+
+# VM86 mode
+file   arch/i386/i386/vm86.c                   vm86
+
+# VM86 in kernel
+file   arch/i386/i386/kvm86.c                  kvm86
+file   arch/i386/i386/kvm86call.S              kvm86
+
+# Binary compatibility with previous NetBSD releases (COMPAT_XX)
+file   arch/i386/i386/compat_13_machdep.c      compat_13 | compat_aout
+file   arch/i386/i386/compat_16_machdep.c      compat_16 | compat_ibcs2
+
+# SVR4 binary compatibility (COMPAT_SVR4)
+include        "compat/svr4/files.svr4"
+file   arch/i386/i386/svr4_machdep.c           compat_svr4
+file   arch/i386/i386/svr4_sigcode.S           compat_svr4
+file   arch/i386/i386/svr4_syscall.c           compat_svr4
+
+# MACH binary compatibility (COMPAT_MACH)
+include        "compat/mach/files.mach"
+file   arch/i386/i386/mach_machdep.c           compat_mach | compat_darwin
+file   arch/i386/i386/mach_sigcode.S           compat_mach | compat_darwin
+file   arch/i386/i386/mach_syscall.c           compat_mach | compat_darwin
+file   arch/i386/i386/macho_machdep.c          exec_macho
+
+# DARWIN binary compatibility (COMPAT_DARWIN)
+include        "compat/darwin/files.darwin"
+file   arch/i386/i386/darwin_machdep.c         compat_darwin
+
+# iBCS-2 binary compatibility (COMPAT_IBCS2)
+include        "compat/ibcs2/files.ibcs2"
+file   arch/i386/i386/ibcs2_machdep.c          compat_ibcs2
+file   arch/i386/i386/ibcs2_sigcode.S          compat_ibcs2
+file   arch/i386/i386/ibcs2_syscall.c          compat_ibcs2
+
+# Linux binary compatibility (COMPAT_LINUX)
+include        "compat/linux/files.linux"
+include        "compat/linux/arch/i386/files.linux_i386"
+file   arch/i386/i386/linux_sigcode.S          compat_linux
+file   arch/i386/i386/linux_syscall.c          compat_linux
+file   arch/i386/i386/linux_trap.c             compat_linux
+
+# FreeBSD binary compatibility (COMPAT_FREEBSD)
+include        "compat/freebsd/files.freebsd"
+file   arch/i386/i386/freebsd_machdep.c        compat_freebsd
+file   arch/i386/i386/freebsd_sigcode.S        compat_freebsd
+file   arch/i386/i386/freebsd_syscall.c        compat_freebsd
+
+# a.out binary compatibility (COMPAT_AOUT)
+include        "compat/aout/files.aout"
+
+# Win32 binary compatibility (COMPAT_PECOFF)
+include        "compat/pecoff/files.pecoff"
+
+# OSS audio driver compatibility
+include        "compat/ossaudio/files.ossaudio"
+
+# Xen devices
+
+# Network driver
+device xennet: arp, ether, ifnet
+attach xennet at hypervisor
+file   arch/xen/xen/if_xennet.c        xennet needs-flag
+
+# Block device driver and wd/sd/cd identities
+device xbd: disk
+attach xbd at hypervisor
+file   arch/xen/xen/xbd.c              xbd | wd | sd | cd needs-flag
+
+device wd: disk
+attach wd at hypervisor
+
+device sd: disk
+attach sd at hypervisor
+
+device cd: disk
+attach cd at hypervisor
+
+# Keyboard
+device xenkbc: pckbport
+attach xenkbc at hypervisor
+file   arch/xen/xen/xenkbc.c           xenkbc          needs-flag
+
+# Generic VGA
+attach vga at hypervisor with vga_xen
+file   arch/xen/xen/vga_xen.c          vga_xen         needs-flag
+
+# Domain-0 operations
+defflag        opt_xen.h                       DOM0OPS
+file   arch/xen/xen/machmem.c          dom0ops
+file   arch/xen/xen/privcmd.c          dom0ops
+file   arch/xen/xen/vfr.c              dom0ops
+
+include "arch/xen/conf/majors.i386"
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/autoconf.c
new file mode 100644 (file)
index 0000000..766b7aa
--- /dev/null
@@ -0,0 +1,630 @@
+/*     $NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $      */
+/*     NetBSD: autoconf.c,v 1.75 2003/12/30 12:33:22 pk Exp    */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)autoconf.c  7.1 (Berkeley) 5/9/91
+ */
+
+/*
+ * Setup the system to run on the current machine.
+ *
+ * Configure() is called at boot time and initializes the vba
+ * device tables and the memory controller monitoring.  Available
+ * devices are determined (from possibilities mentioned in ioconf.c),
+ * and the drivers are initialized.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
+
+#include "opt_compat_oldboot.h"
+#include "opt_multiprocessor.h"
+#include "opt_nfs_boot.h"
+#include "xennet.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/disklabel.h>
+#include <sys/conf.h>
+#ifdef COMPAT_OLDBOOT
+#include <sys/reboot.h>
+#endif
+#include <sys/device.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/fcntl.h>
+#include <sys/dkio.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+
+#ifdef NFS_BOOT_BOOTSTATIC
+#include <net/if.h>
+#include <net/if_ether.h>
+#include <netinet/in.h>
+#include <nfs/rpcv2.h>
+#include <nfs/nfsproto.h>
+#include <nfs/nfs.h>
+#include <nfs/nfsmount.h>
+#include <nfs/nfsdiskless.h>
+#include <machine/if_xennetvar.h>
+#endif
+
+#include <machine/pte.h>
+#include <machine/cpu.h>
+#include <machine/gdt.h>
+#include <machine/pcb.h>
+#include <machine/bootinfo.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#if NIOAPIC > 0
+#include <machine/i82093var.h>
+#endif
+
+#if NLAPIC > 0
+#include <machine/i82489var.h>
+#endif
+
+static int match_harddisk(struct device *, struct btinfo_bootdisk *);
+static void matchbiosdisks(void);
+static void findroot(void);
+static int is_valid_disk(struct device *);
+
+extern struct disklist *i386_alldisks;
+extern int i386_ndisks;
+
+#include "bios32.h"
+#if NBIOS32 > 0
+#include <machine/bios32.h>
+#endif
+
+#include "opt_pcibios.h"
+#ifdef PCIBIOS
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+#include <i386/pci/pcibios.h>
+#endif
+
+#include "opt_kvm86.h"
+#ifdef KVM86
+#include <machine/kvm86.h>
+#endif
+
+#include "opt_xen.h"
+
+struct device *booted_device;
+int booted_partition;
+
+/*
+ * Determine i/o configuration for a machine.
+ */
+void
+cpu_configure(void)
+{
+
+       startrtclock();
+
+#if NBIOS32 > 0
+       bios32_init();
+#endif
+#ifdef PCIBIOS
+       pcibios_init();
+#endif
+
+       /* kvm86 needs a TSS */
+       i386_proc0_tss_ldt_init();
+#ifdef KVM86
+       kvm86_init();
+#endif
+
+       if (config_rootfound("mainbus", NULL) == NULL)
+               panic("configure: mainbus not configured");
+
+#ifdef INTRDEBUG
+       intr_printconfig();
+#endif
+
+#if NIOAPIC > 0
+       lapic_set_lvt();
+       ioapic_enable();
+#endif
+       /* resync cr0 after FPU configuration */
+       lwp0.l_addr->u_pcb.pcb_cr0 = rcr0();
+#ifdef MULTIPROCESSOR
+       /* propagate this to the idle pcb's. */
+       cpu_init_idle_pcbs();
+#endif
+
+       spl0();
+#if NLAPIC > 0
+       lapic_tpr = 0;
+#endif
+}
+
+void
+cpu_rootconf(void)
+{
+       findroot();
+       matchbiosdisks();
+
+       printf("boot device: %s\n",
+           booted_device ? booted_device->dv_xname : "<unknown>");
+
+       setroot(booted_device, booted_partition);
+}
+
+/*
+ * XXX ugly bit of code. But, this is the only safe time that the
+ * match between BIOS disks and native disks can be done.
+ */
+static void
+matchbiosdisks(void)
+{
+       struct btinfo_biosgeom *big;
+       struct bi_biosgeom_entry *be;
+       struct device *dv;
+       int i, ck, error, m, n;
+       struct vnode *tv;
+       char mbr[DEV_BSIZE];
+       int  dklist_size;
+       int bmajor;
+
+       big = lookup_bootinfo(BTINFO_BIOSGEOM);
+
+       if (big == NULL)
+               return;
+
+       /*
+        * First, count all native disks
+        */
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next)
+               if (is_valid_disk(dv))
+                       i386_ndisks++;
+
+       if (i386_ndisks == 0)
+               return;
+
+       dklist_size = sizeof (struct disklist) + (i386_ndisks - 1) *
+           sizeof (struct nativedisk_info);
+
+       /* XXX M_TEMP is wrong */
+       i386_alldisks = malloc(dklist_size, M_TEMP, M_NOWAIT);
+       if (i386_alldisks == NULL)
+               return;
+
+       memset(i386_alldisks, 0, dklist_size);
+
+       i386_alldisks->dl_nnativedisks = i386_ndisks;
+       i386_alldisks->dl_nbiosdisks = big->num;
+       for (i = 0; i < big->num; i++) {
+               i386_alldisks->dl_biosdisks[i].bi_dev = big->disk[i].dev;
+               i386_alldisks->dl_biosdisks[i].bi_sec = big->disk[i].sec;
+               i386_alldisks->dl_biosdisks[i].bi_head = big->disk[i].head;
+               i386_alldisks->dl_biosdisks[i].bi_cyl = big->disk[i].cyl;
+               i386_alldisks->dl_biosdisks[i].bi_lbasecs = big->disk[i].totsec;
+               i386_alldisks->dl_biosdisks[i].bi_flags = big->disk[i].flags;
+#ifdef GEOM_DEBUG
+#ifdef NOTYET
+               printf("disk %x: flags %x, interface %x, device %llx\n",
+                       big->disk[i].dev, big->disk[i].flags,
+                       big->disk[i].interface_path, big->disk[i].device_path);
+#endif
+#endif
+       }
+
+       /*
+        * XXX code duplication from findroot()
+        */
+       n = -1;
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+               if (dv->dv_class != DV_DISK)
+                       continue;
+#ifdef GEOM_DEBUG
+               printf("matchbiosdisks: trying to match (%s) %s\n",
+                   dv->dv_xname, dv->dv_cfdata->cf_name);
+#endif
+               if (is_valid_disk(dv)) {
+                       n++;
+                       sprintf(i386_alldisks->dl_nativedisks[n].ni_devname,
+                           "%s%d", dv->dv_cfdata->cf_name,
+                           dv->dv_unit);
+
+                       bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
+                       if (bmajor == -1)
+                               return;
+
+                       if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, RAW_PART),
+                           &tv))
+                               panic("matchbiosdisks: can't alloc vnode");
+
+                       error = VOP_OPEN(tv, FREAD, NOCRED, 0);
+                       if (error) {
+                               vput(tv);
+                               continue;
+                       }
+                       error = vn_rdwr(UIO_READ, tv, mbr, DEV_BSIZE, 0,
+                           UIO_SYSSPACE, 0, NOCRED, NULL, 0);
+                       VOP_CLOSE(tv, FREAD, NOCRED, 0);
+                       if (error) {
+#ifdef GEOM_DEBUG
+                               printf("matchbiosdisks: %s: MBR read failure\n",
+                                   dv->dv_xname);
+#endif
+                               continue;
+                       }
+
+                       for (ck = i = 0; i < DEV_BSIZE; i++)
+                               ck += mbr[i];
+                       for (m = i = 0; i < big->num; i++) {
+                               be = &big->disk[i];
+#ifdef GEOM_DEBUG
+                               printf("match %s with %d ", dv->dv_xname, i);
+                               printf("dev ck %x bios ck %x\n", ck, be->cksum);
+#endif
+                               if (be->flags & BI_GEOM_INVALID)
+                                       continue;
+                               if (be->cksum == ck &&
+                                   !memcmp(&mbr[MBR_PART_OFFSET], be->dosparts,
+                                       MBR_PART_COUNT *
+                                           sizeof (struct mbr_partition))) {
+#ifdef GEOM_DEBUG
+                                       printf("matched bios disk %x with %s\n",
+                                           be->dev, dv->dv_xname);
+#endif
+                                       i386_alldisks->dl_nativedisks[n].
+                                           ni_biosmatches[m++] = i;
+                               }
+                       }
+                       i386_alldisks->dl_nativedisks[n].ni_nmatches = m;
+                       vput(tv);
+               }
+       }
+}
+
+#ifdef COMPAT_OLDBOOT
+u_long bootdev = 0;            /* should be dev_t, but not until 32 bits */
+#endif
+
+/*
+ * helper function for "findroot()":
+ * return nonzero if disk device matches bootinfo
+ */
+static int
+match_harddisk(struct device *dv, struct btinfo_bootdisk *bid)
+{
+       struct vnode *tmpvn;
+       int error;
+       struct disklabel label;
+       int found = 0;
+       int bmajor;
+
+       /*
+        * A disklabel is required here.  The
+        * bootblocks don't refuse to boot from
+        * a disk without a label, but this is
+        * normally not wanted.
+        */
+       if (bid->labelsector == -1)
+               return(0);
+
+       /*
+        * lookup major number for disk block device
+        */
+       bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
+       if (bmajor == -1)
+               return(0); /* XXX panic() ??? */
+
+       /*
+        * Fake a temporary vnode for the disk, open
+        * it, and read the disklabel for comparison.
+        */
+       if (bdevvp(MAKEDISKDEV(bmajor, dv->dv_unit, bid->partition), &tmpvn))
+               panic("findroot can't alloc vnode");
+       error = VOP_OPEN(tmpvn, FREAD, NOCRED, 0);
+       if (error) {
+#ifndef DEBUG
+               /*
+                * Ignore errors caused by missing
+                * device, partition or medium.
+                */
+               if (error != ENXIO && error != ENODEV)
+#endif
+                       printf("findroot: can't open dev %s%c (%d)\n",
+                              dv->dv_xname, 'a' + bid->partition, error);
+               vput(tmpvn);
+               return(0);
+       }
+       error = VOP_IOCTL(tmpvn, DIOCGDINFO, &label, FREAD, NOCRED, 0);
+       if (error) {
+               /*
+                * XXX can't happen - open() would
+                * have errored out (or faked up one)
+                */
+               printf("can't get label for dev %s%c (%d)\n",
+                      dv->dv_xname, 'a' + bid->partition, error);
+               goto closeout;
+       }
+
+       /* compare with our data */
+       if (label.d_type == bid->label.type &&
+           label.d_checksum == bid->label.checksum &&
+           !strncmp(label.d_packname, bid->label.packname, 16))
+               found = 1;
+
+closeout:
+       VOP_CLOSE(tmpvn, FREAD, NOCRED, 0);
+       vput(tmpvn);
+       return(found);
+}
+
+/*
+ * Attempt to find the device from which we were booted.
+ * If we can do so, and not instructed not to do so,
+ * change rootdev to correspond to the load device.
+ */
+void
+findroot(void)
+{
+       struct btinfo_bootdisk *bid;
+       struct device *dv;
+       union xen_cmdline_parseinfo xcp;
+#ifdef COMPAT_OLDBOOT
+       int i, majdev, unit, part;
+       const char *name;
+       char buf[32];
+#endif
+
+       if (booted_device)
+               return;
+
+       if (lookup_bootinfo(BTINFO_NETIF)) {
+               /*
+                * We got netboot interface information, but
+                * "device_register()" couldn't match it to a configured
+                * device. Bootdisk information cannot be present at the
+                * same time, so give up.
+                */
+               printf("findroot: netboot interface not found\n");
+               return;
+       }
+
+       bid = lookup_bootinfo(BTINFO_BOOTDISK);
+       if (bid) {
+               /*
+                * Scan all disk devices for ones that match the passed data.
+                * Don't break if one is found, to get possible multiple
+                * matches - for problem tracking. Use the first match anyway
+                * because lower device numbers are more likely to be the
+                * boot device.
+                */
+               for (dv = alldevs.tqh_first; dv != NULL;
+                   dv = dv->dv_list.tqe_next) {
+                       if (dv->dv_class != DV_DISK)
+                               continue;
+
+                       if (!strcmp(dv->dv_cfdata->cf_name, "fd")) {
+                               /*
+                                * Assume the configured unit number matches
+                                * the BIOS device number.  (This is the old
+                                * behaviour.)  Needs some ideas how to handle
+                                * BIOS's "swap floppy drive" options.
+                                */
+                               if ((bid->biosdev & 0x80) ||
+                                   dv->dv_unit != bid->biosdev)
+                                       continue;
+
+                               goto found;
+                       }
+
+                       if (is_valid_disk(dv)) {
+                               /*
+                                * Don't trust BIOS device numbers, try
+                                * to match the information passed by the
+                                * bootloader instead.
+                                */
+                               if ((bid->biosdev & 0x80) == 0 ||
+                                   !match_harddisk(dv, bid))
+                                       continue;
+
+                               goto found;
+                       }
+
+                       /* no "fd", "wd", "sd", "ld", "ed" */
+                       continue;
+
+found:
+                       if (booted_device) {
+                               printf("warning: double match for boot "
+                                   "device (%s, %s)\n",
+                                   booted_device->dv_xname, dv->dv_xname);
+                               continue;
+                       }
+                       booted_device = dv;
+                       booted_partition = bid->partition;
+               }
+
+               if (booted_device)
+                       return;
+       }
+
+       xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
+
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+               if (is_valid_disk(dv) == 0)
+                       continue;
+
+               if (xcp.xcp_bootdev[0] == 0) {
+                       booted_device = dv;
+                       break;
+               }
+
+               if (strncmp(xcp.xcp_bootdev, dv->dv_xname,
+                   strlen(dv->dv_xname)))
+                       continue;
+
+               if (strlen(xcp.xcp_bootdev) > strlen(dv->dv_xname)) {
+                       booted_partition = toupper(
+                               xcp.xcp_bootdev[strlen(dv->dv_xname)]) - 'A';
+               }
+
+               booted_device = dv;
+               break;
+       }
+
+       if (booted_device)
+               return;
+
+#ifdef COMPAT_OLDBOOT
+#if 0
+       printf("howto %x bootdev %x ", boothowto, bootdev);
+#endif
+
+       if ((bootdev & B_MAGICMASK) != (u_long)B_DEVMAGIC)
+               return;
+
+       majdev = (bootdev >> B_TYPESHIFT) & B_TYPEMASK;
+       name = devsw_blk2name(majdev);
+       if (name == NULL)
+               return;
+
+       part = (bootdev >> B_PARTITIONSHIFT) & B_PARTITIONMASK;
+       unit = (bootdev >> B_UNITSHIFT) & B_UNITMASK;
+
+       sprintf(buf, "%s%d", name, unit);
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+               if (strcmp(buf, dv->dv_xname) == 0) {
+                       booted_device = dv;
+                       booted_partition = part;
+                       return;
+               }
+       }
+#endif
+}
+
+#include "pci.h"
+
+#include <dev/isa/isavar.h>
+#if NPCI > 0
+#include <dev/pci/pcivar.h>
+#endif
+
+void
+device_register(struct device *dev, void *aux)
+{
+       /*
+        * Handle network interfaces here; the attachment information is
+        * not available driver-independently later.
+        * For disks, there is nothing useful available at attach time.
+        */
+#if NXENNET > 0
+       if (dev->dv_class == DV_IFNET) {
+               union xen_cmdline_parseinfo xcp;
+
+               xen_parse_cmdline(XEN_PARSE_BOOTDEV, &xcp);
+               if (strncmp(xcp.xcp_bootdev, dev->dv_xname, 16) == 0) {
+#ifdef NFS_BOOT_BOOTSTATIC
+                       nfs_bootstatic_callback = xennet_bootstatic_callback;
+#endif
+                       goto found;
+               }
+       }
+#endif
+       if (dev->dv_class == DV_IFNET) {
+               struct btinfo_netif *bin = lookup_bootinfo(BTINFO_NETIF);
+               if (bin == NULL)
+                       return;
+
+               /*
+                * We don't check the driver name against the device name
+                * passed by the boot ROM. The ROM should stay usable
+                * if the driver gets obsoleted.
+                * The physical attachment information (checked below)
+                * must be sufficient to identify the device.
+                */
+
+               if (bin->bus == BI_BUS_ISA &&
+                   !strcmp(dev->dv_parent->dv_cfdata->cf_name, "isa")) {
+                       struct isa_attach_args *iaa = aux;
+
+                       /* compare IO base address */
+                       /* XXXJRT what about multiple I/O addrs? */
+                       if (iaa->ia_nio > 0 &&
+                           bin->addr.iobase == iaa->ia_io[0].ir_addr)
+                               goto found;
+               }
+#if NPCI > 0
+               if (bin->bus == BI_BUS_PCI &&
+                   !strcmp(dev->dv_parent->dv_cfdata->cf_name, "pci")) {
+                       struct pci_attach_args *paa = aux;
+                       int b, d, f;
+
+                       /*
+                        * Calculate BIOS representation of:
+                        *
+                        *      <bus,device,function>
+                        *
+                        * and compare.
+                        */
+                       pci_decompose_tag(paa->pa_pc, paa->pa_tag, &b, &d, &f);
+                       if (bin->addr.tag == ((b << 8) | (d << 3) | f))
+                               goto found;
+               }
+#endif
+       }
+       return;
+
+found:
+       if (booted_device) {
+               /* XXX should be a "panic()" */
+               printf("warning: double match for boot device (%s, %s)\n",
+                   booted_device->dv_xname, dev->dv_xname);
+               return;
+       }
+       booted_device = dev;
+}
+
+static int
+is_valid_disk(struct device *dv)
+{
+       const char *name;
+
+       if (dv->dv_class != DV_DISK)
+               return (0);
+
+       name = dv->dv_cfdata->cf_name;
+
+       return (strcmp(name, "sd") == 0 || strcmp(name, "wd") == 0 ||
+           strcmp(name, "ld") == 0 || strcmp(name, "ed") == 0 ||
+           strcmp(name, "xbd") == 0);
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/gdt.c
new file mode 100644 (file)
index 0000000..23dd52f
--- /dev/null
@@ -0,0 +1,408 @@
+/*     $NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $       */
+/*     NetBSD: gdt.c,v 1.32 2004/02/13 11:36:13 wiz Exp        */
+
+/*-
+ * Copyright (c) 1996, 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by John T. Kohl and Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: gdt.c,v 1.1 2004/03/11 21:44:08 cl Exp $");
+
+#include "opt_multiprocessor.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/user.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/gdt.h>
+
+int gdt_size[2];       /* total number of GDT entries */
+int gdt_count[2];      /* number of GDT entries in use */
+int gdt_next[2];       /* next available slot for sweeping */
+int gdt_free[2];       /* next free slot; terminated with GNULL_SEL */
+
+struct lock gdt_lock_store;
+
+static __inline void gdt_lock(void);
+static __inline void gdt_unlock(void);
+void gdt_init(void);
+void gdt_grow(int);
+int gdt_get_slot(void);
+int gdt_get_slot1(int);
+void gdt_put_slot(int);
+void gdt_put_slot1(int, int);
+
+/*
+ * Lock and unlock the GDT, to avoid races in case gdt_{ge,pu}t_slot() sleep
+ * waiting for memory.
+ *
+ * Note that the locking done here is not sufficient for multiprocessor
+ * systems.  A freshly allocated slot will still be of type SDT_SYSNULL for
+ * some time after the GDT is unlocked, so gdt_compact() could attempt to
+ * reclaim it.
+ */
+static __inline void
+gdt_lock()
+{
+
+       (void) lockmgr(&gdt_lock_store, LK_EXCLUSIVE, NULL);
+}
+
+static __inline void
+gdt_unlock()
+{
+
+       (void) lockmgr(&gdt_lock_store, LK_RELEASE, NULL);
+}
+
+void
+setgdt(int sel, void *base, size_t limit,
+    int type, int dpl, int def32, int gran)
+{
+       struct segment_descriptor sd;
+       CPU_INFO_ITERATOR cii;
+       struct cpu_info *ci;
+
+       if (type == SDT_SYS386TSS) {
+               /* printk("XXX TSS descriptor not supported in GDT\n"); */
+               return;
+       }
+
+       setsegment(&sd, base, limit, type, dpl, def32, gran);
+       for (CPU_INFO_FOREACH(cii, ci)) {
+               if (ci->ci_gdt != NULL) {
+#ifndef XEN
+                       ci->ci_gdt[sel].sd = sd;
+#else
+                       xen_update_descriptor(&ci->ci_gdt[sel],
+                           (union descriptor *)&sd);
+#endif
+               }
+       }
+}
+
+/*
+ * Initialize the GDT subsystem.  Called from autoconf().
+ */
+void
+gdt_init()
+{
+       size_t max_len, min_len;
+       union descriptor *old_gdt;
+       struct vm_page *pg;
+       vaddr_t va;
+       struct cpu_info *ci = &cpu_info_primary;
+
+       lockinit(&gdt_lock_store, PZERO, "gdtlck", 0, 0);
+
+       max_len = MAXGDTSIZ * sizeof(gdt[0]);
+       min_len = MINGDTSIZ * sizeof(gdt[0]);
+
+       gdt_size[0] = MINGDTSIZ;
+       gdt_count[0] = NGDT;
+       gdt_next[0] = NGDT;
+       gdt_free[0] = GNULL_SEL;
+
+       gdt_size[1] = 0;
+       gdt_count[1] = MAXGDTSIZ;
+       gdt_next[1] = MAXGDTSIZ;
+       gdt_free[1] = GNULL_SEL;
+
+       old_gdt = gdt;
+       gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len + max_len);
+       for (va = (vaddr_t)gdt; va < (vaddr_t)gdt + min_len; va += PAGE_SIZE) {
+               pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+               if (pg == NULL) {
+                       panic("gdt_init: no pages");
+               }
+               pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                   VM_PROT_READ | VM_PROT_WRITE);
+       }
+       memcpy(gdt, old_gdt, NGDT * sizeof(gdt[0]));
+       ci->ci_gdt = gdt;
+       setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
+           SDT_MEMRWA, SEL_KPL, 1, 1);
+
+       gdt_init_cpu(ci);
+}
+
+/*
+ * Allocate shadow GDT for a slave CPU.
+ */
+void
+gdt_alloc_cpu(struct cpu_info *ci)
+{
+       int max_len = MAXGDTSIZ * sizeof(gdt[0]);
+       int min_len = MINGDTSIZ * sizeof(gdt[0]);
+       struct vm_page *pg;
+       vaddr_t va;
+
+       ci->ci_gdt = (union descriptor *)uvm_km_valloc(kernel_map, max_len);
+       for (va = (vaddr_t)ci->ci_gdt; va < (vaddr_t)ci->ci_gdt + min_len;
+           va += PAGE_SIZE) {
+               while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO))
+                   == NULL) {
+                       uvm_wait("gdt_alloc_cpu");
+               }
+               pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                   VM_PROT_READ | VM_PROT_WRITE);
+       }
+       memset(ci->ci_gdt, 0, min_len);
+       memcpy(ci->ci_gdt, gdt, gdt_count[0] * sizeof(gdt[0]));
+       setsegment(&ci->ci_gdt[GCPU_SEL].sd, ci, sizeof(struct cpu_info)-1,
+           SDT_MEMRWA, SEL_KPL, 1, 1);
+}
+
+
+/*
+ * Load appropriate gdt descriptor; we better be running on *ci
+ * (for the most part, this is how a CPU knows who it is).
+ */
+void
+gdt_init_cpu(struct cpu_info *ci)
+{
+#ifndef XEN
+       struct region_descriptor region;
+       size_t max_len;
+
+       max_len = MAXGDTSIZ * sizeof(gdt[0]);
+       setregion(&region, ci->ci_gdt, max_len - 1);
+       lgdt(&region);
+#else
+       size_t len = gdt_size[0] * sizeof(gdt[0]);
+       unsigned long frames[len >> PAGE_SHIFT];
+       vaddr_t va;
+       pt_entry_t *ptp;
+       pt_entry_t *maptp;
+       int f;
+
+       for (va = (vaddr_t)ci->ci_gdt, f = 0;
+            va < (vaddr_t)ci->ci_gdt + len;
+            va += PAGE_SIZE, f++) {
+               KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
+               ptp = kvtopte(va);
+               frames[f] = *ptp >> PAGE_SHIFT;
+               maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
+               PTE_CLEARBITS(ptp, maptp, PG_RW);
+       }
+       PTE_UPDATES_FLUSH();
+       /* printk("loading gdt %x, %d entries, %d pages", */
+           /* frames[0] << PAGE_SHIFT, gdt_size[0], len >> PAGE_SHIFT); */
+       if (HYPERVISOR_set_gdt(frames, gdt_size[0]))
+               panic("HYPERVISOR_set_gdt failed!\n");
+       lgdt_finish();
+#endif
+}
+
+#ifdef MULTIPROCESSOR
+
+void
+gdt_reload_cpu(struct cpu_info *ci)
+{
+       struct region_descriptor region;
+       size_t max_len;
+
+       max_len = MAXGDTSIZ * sizeof(gdt[0]);
+       setregion(&region, ci->ci_gdt, max_len - 1);
+       lgdt(&region);
+}
+#endif
+
+
+/*
+ * Grow the GDT.
+ */
+void
+gdt_grow(int which)
+{
+       size_t old_len, new_len, max_len;
+       CPU_INFO_ITERATOR cii;
+       struct cpu_info *ci;
+       struct vm_page *pg;
+       vaddr_t va;
+
+       old_len = gdt_size[which] * sizeof(gdt[0]);
+       gdt_size[which] <<= 1;
+       new_len = old_len << 1;
+
+       if (which != 0) {
+               max_len = MAXGDTSIZ * sizeof(gdt[0]);
+               if (old_len == 0) {
+                       gdt_size[which] = MINGDTSIZ;
+                       new_len = gdt_size[which] * sizeof(gdt[0]);
+               }
+               for (va = (vaddr_t)(cpu_info_primary.ci_gdt) + old_len + max_len;
+                    va < (vaddr_t)(cpu_info_primary.ci_gdt) + new_len + max_len;
+                    va += PAGE_SIZE) {
+                       while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
+                           NULL) {
+                               uvm_wait("gdt_grow");
+                       }
+                       pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                           VM_PROT_READ | VM_PROT_WRITE);
+               }
+               return;
+       }
+
+       for (CPU_INFO_FOREACH(cii, ci)) {
+               for (va = (vaddr_t)(ci->ci_gdt) + old_len;
+                    va < (vaddr_t)(ci->ci_gdt) + new_len;
+                    va += PAGE_SIZE) {
+                       while ((pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO)) ==
+                           NULL) {
+                               uvm_wait("gdt_grow");
+                       }
+                       pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                           VM_PROT_READ | VM_PROT_WRITE);
+               }
+       }
+}
+
+/*
+ * Allocate a GDT slot as follows:
+ * 1) If there are entries on the free list, use those.
+ * 2) If there are fewer than gdt_size entries in use, there are free slots
+ *    near the end that we can sweep through.
+ * 3) As a last resort, we increase the size of the GDT, and sweep through
+ *    the new slots.
+ */
+int
+gdt_get_slot()
+{
+       return gdt_get_slot1(0);
+}
+
+int
+gdt_get_slot1(int which)
+{
+       size_t offset;
+       int slot;
+
+       gdt_lock();
+
+       if (gdt_free[which] != GNULL_SEL) {
+               slot = gdt_free[which];
+               gdt_free[which] = gdt[slot].gd.gd_selector;
+       } else {
+               offset = which * MAXGDTSIZ * sizeof(gdt[0]);
+               if (gdt_next[which] != gdt_count[which] + offset)
+                       panic("gdt_get_slot botch 1");
+               if (gdt_next[which] - offset >= gdt_size[which]) {
+                       if (gdt_size[which] >= MAXGDTSIZ)
+                               panic("gdt_get_slot botch 2");
+                       gdt_grow(which);
+               }
+               slot = gdt_next[which]++;
+       }
+
+       gdt_count[which]++;
+       gdt_unlock();
+       return (slot);
+}
+
+/*
+ * Deallocate a GDT slot, putting it on the free list.
+ */
+void
+gdt_put_slot(int slot)
+{
+       gdt_put_slot1(slot, 0);
+}
+
+void
+gdt_put_slot1(int slot, int which)
+{
+
+       gdt_lock();
+       gdt_count[which]--;
+
+       gdt[slot].gd.gd_type = SDT_SYSNULL;
+       gdt[slot].gd.gd_selector = gdt_free[which];
+       gdt_free[which] = slot;
+
+       gdt_unlock();
+}
+
+int
+tss_alloc(struct pcb *pcb)
+{
+       int slot;
+
+       slot = gdt_get_slot();
+       setgdt(slot, &pcb->pcb_tss, sizeof(struct pcb) - 1,
+           SDT_SYS386TSS, SEL_KPL, 0, 0);
+       return GSEL(slot, SEL_KPL);
+}
+
+void
+tss_free(int sel)
+{
+
+       gdt_put_slot(IDXSEL(sel));
+}
+
+/*
+ * Caller must have pmap locked for both of these functions.
+ */
+void
+ldt_alloc(struct pmap *pmap, union descriptor *ldt, size_t len)
+{
+       int slot;
+
+       slot = gdt_get_slot1(1);
+#ifndef XEN
+       setgdt(slot, ldt, len - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
+#else
+       cpu_info_primary.ci_gdt[slot].ld.ld_base = (uint32_t)ldt;
+       cpu_info_primary.ci_gdt[slot].ld.ld_entries =
+               len / sizeof(union descriptor);
+#endif
+       pmap->pm_ldt_sel = GSEL(slot, SEL_KPL);
+}
+
+void
+ldt_free(struct pmap *pmap)
+{
+       int slot;
+
+       slot = IDXSEL(pmap->pm_ldt_sel);
+
+       gdt_put_slot1(slot, 1);
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/hypervisor_machdep.c
new file mode 100644 (file)
index 0000000..e08b5a6
--- /dev/null
@@ -0,0 +1,230 @@
+/*     $NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $  */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/******************************************************************************
+ * hypervisor.c
+ * 
+ * Communication to/from hypervisor.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.2.2.2 2004/06/17 09:23:13 tron Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void
+hypervisor_force_callback(void)
+{
+
+       (void)HYPERVISOR_xen_version(0);
+}
+
+int stipending(void);
+int
+stipending(void)
+{
+       uint32_t l1;
+       unsigned long l2;
+       unsigned int l1i, l2i, port;
+       int irq;
+       shared_info_t *s = HYPERVISOR_shared_info;
+       struct cpu_info *ci;
+       int ret;
+
+       ret = 0;
+       ci = curcpu();
+
+#if 0
+       if (HYPERVISOR_shared_info->events)
+               printf("stipending events %08lx mask %08lx ilevel %d\n",
+                   HYPERVISOR_shared_info->events,
+                   HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
+#endif
+
+       /*
+        * We're only called after STIC, so we know that we'll have to
+        * STI at the end.
+        */
+       cli();
+       while (s->vcpu_data[0].evtchn_upcall_pending) {
+               s->vcpu_data[0].evtchn_upcall_pending = 0;
+               /* NB. No need for a barrier here -- XCHG is a barrier
+                * on x86. */
+               l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
+               while ((l1i = ffs(l1)) != 0) {
+                       l1i--;
+                       l1 &= ~(1 << l1i);
+
+                       l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
+                       while ((l2i = ffs(l2)) != 0) {
+                               l2i--;
+                               l2 &= ~(1 << l2i);
+
+                               port = (l1i << 5) + l2i;
+                               if ((irq = evtchn_to_irq[port]) != -1) {
+                                       hypervisor_acknowledge_irq(irq);
+                                       ci->ci_ipending |= (1 << irq);
+                                       if (ret == 0 && ci->ci_ilevel <
+                                           ci->ci_isources[irq]->is_handlers
+                                           ->ih_level)
+                                               ret = 1;
+                               }
+#if 0 /* XXXcl dev/evtchn */
+                               else
+                                       evtchn_device_upcall(port);
+#endif
+                       }
+               }
+       }
+       sti();
+
+#if 0
+       if (ci->ci_ipending & 0x1)
+               printf("stipending events %08lx mask %08lx ilevel %d ipending %08x\n",
+                   HYPERVISOR_shared_info->events,
+                   HYPERVISOR_shared_info->events_mask, ci->ci_ilevel,
+                   ci->ci_ipending);
+#endif
+
+       return (ret);
+}
+
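
The nested ffs() loops above walk Xen's two-level pending bitmap:
evtchn_pending_sel picks a 32-bit word of evtchn_pending[], and each set bit
in that word names one event channel, so a port number packs both indices.
A minimal standalone sketch of the arithmetic (the sample port value is
arbitrary):

    /* Standalone sketch of the port <-> (l1i, l2i) packing used above. */
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int port = 37;         /* arbitrary sample channel */
            unsigned int l1i = port >> 5;   /* word index in evtchn_pending[] */
            unsigned int l2i = port & 31;   /* bit within that 32-bit word */

            /* Inverse of "port = (l1i << 5) + l2i" in the loops above. */
            printf("port %u -> word %u, bit %u\n", port, l1i, l2i);
            return 0;
    }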
+void
+do_hypervisor_callback(struct trapframe *regs)
+{
+       uint32_t l1;
+       unsigned long l2;
+       unsigned int l1i, l2i, port;
+       int irq;
+       shared_info_t *s = HYPERVISOR_shared_info;
+       struct cpu_info *ci;
+       int level;
+
+       ci = curcpu();
+       level = ci->ci_ilevel;
+
+       while (s->vcpu_data[0].evtchn_upcall_pending) {
+               s->vcpu_data[0].evtchn_upcall_pending = 0;
+               /* NB. No need for a barrier here -- XCHG is a barrier
+                * on x86. */
+               l1 = x86_atomic_xchg(&s->evtchn_pending_sel, 0);
+               while ((l1i = ffs(l1)) != 0) {
+                       l1i--;
+                       l1 &= ~(1 << l1i);
+
+                       l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i];
+                       while ((l2i = ffs(l2)) != 0) {
+                               l2i--;
+                               l2 &= ~(1 << l2i);
+
+                               port = (l1i << 5) + l2i;
+                               if ((irq = evtchn_to_irq[port]) != -1)
+                                       do_event(irq, regs);
+#if 0 /* XXXcl dev/evtchn */
+                               else
+                                       evtchn_device_upcall(port);
+#endif
+                       }
+               }
+       }
+
+#ifdef DIAGNOSTIC
+       if (level != ci->ci_ilevel)
+               printf("hypervisor done %08x level %d/%d ipending %08x\n",
+                   HYPERVISOR_shared_info->evtchn_pending_sel, level,
+                   ci->ci_ilevel, ci->ci_ipending);
+#endif
+}
+
+void
+hypervisor_unmask_event(unsigned int ev)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+
+       x86_atomic_clear_bit(&s->evtchn_mask[0], ev);
+       /*
+        * The following is basically the equivalent of
+        * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
+        * interrupt edge' if the channel is masked.
+        */
+       if (x86_atomic_test_bit(&s->evtchn_pending[0], ev) && 
+           !x86_atomic_test_and_set_bit(&s->evtchn_pending_sel, ev>>5)) {
+               s->vcpu_data[0].evtchn_upcall_pending = 1;
+               if (!s->vcpu_data[0].evtchn_upcall_mask)
+                       hypervisor_force_callback();
+       }
+}
+
+void
+hypervisor_mask_event(unsigned int ev)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+
+       x86_atomic_set_bit(&s->evtchn_mask[0], ev);
+}
+
+void
+hypervisor_clear_event(unsigned int ev)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+
+       x86_atomic_clear_bit(&s->evtchn_pending[0], ev);
+}
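
Together these helpers suggest the usual pattern a driver follows around one
event channel; a hedged sketch (the function name and call order below are
illustrative, not mandated by this file):

    /* Sketch, not part of the patch: typical per-channel handling. */
    static void
    example_event_handler(unsigned int port)
    {
            hypervisor_mask_event(port);    /* no further upcalls for port */
            hypervisor_clear_event(port);   /* acknowledge the pending bit */
            /* ... drain the device ring for this channel here ... */
            hypervisor_unmask_event(port);  /* re-enable; forces a callback
                                               if it fired again meanwhile */
    }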
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/locore.S
new file mode 100644 (file)
index 0000000..45af672
--- /dev/null
@@ -0,0 +1,2000 @@
+/*     $NetBSD: locore.S,v 1.2.2.1 2004/05/22 15:59:48 he Exp $        */
+/*     NetBSD: locore.S,v 1.26 2004/04/12 13:17:46 yamt Exp    */
+
+/*-
+ * Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)locore.s    7.3 (Berkeley) 5/13/91
+ */
+
+#include "opt_compat_netbsd.h"
+#include "opt_compat_oldboot.h"
+#include "opt_cputype.h"
+#include "opt_ddb.h"
+#include "opt_ipkdb.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_realmem.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#include "npx.h"
+#include "assym.h"
+#include "apm.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "ksyms.h"
+
+#include <sys/errno.h>
+#include <sys/syscall.h>
+
+#include <machine/cputypes.h>
+#include <machine/param.h>
+#include <machine/pte.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+#include <machine/trap.h>
+#include <machine/bootinfo.h>
+
+#if NLAPIC > 0
+#include <machine/i82489reg.h>
+#endif
+
+/* LINTSTUB: include <sys/types.h> */
+/* LINTSTUB: include <machine/cpu.h> */
+/* LINTSTUB: include <sys/systm.h> */
+
+#include <machine/asm.h>
+
+#if defined(MULTIPROCESSOR)
+       
+#define SET_CURLWP(lwp,cpu)                            \
+       movl    CPUVAR(SELF),cpu                ;       \
+       movl    lwp,CPUVAR(CURLWP)      ;       \
+       movl    cpu,L_CPU(lwp)
+       
+#else
+
+#define SET_CURLWP(lwp,tcpu)           movl    lwp,CPUVAR(CURLWP)
+#define GET_CURLWP(reg)                        movl    CPUVAR(CURLWP),reg
+
+#endif
+
+#define GET_CURPCB(reg)                        movl    CPUVAR(CURPCB),reg      
+#define SET_CURPCB(reg)                        movl    reg,CPUVAR(CURPCB)
+
+#define CLEAR_RESCHED(reg)             movl    reg,CPUVAR(RESCHED)
+
+/* XXX temporary kluge; these should not be here */
+/* Get definitions for IOM_BEGIN, IOM_END, and IOM_SIZE */
+#include <dev/isa/isareg.h>
+
+
+/* Disallow old names for REALBASEMEM */
+#ifdef BIOSBASEMEM
+#error BIOSBASEMEM option deprecated; use REALBASEMEM only if memory size reported by latest boot block is incorrect
+#endif
+
+/* Disallow old names for REALEXTMEM */
+#ifdef EXTMEM_SIZE
+#error EXTMEM_SIZE option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
+#endif
+#ifdef BIOSEXTMEM
+#error BIOSEXTMEM option deprecated; use REALEXTMEM only if memory size reported by latest boot block is incorrect
+#endif
+
+#include <machine/frameasm.h>
+
+
+#ifdef MULTIPROCESSOR
+#include <machine/i82489reg.h>
+#endif
+       
+/*
+ * PTmap is recursive pagemap at top of virtual address space.
+ * Within PTmap, the page directory can be found (third indirection).
+ *
+ * XXX 4 == sizeof pde
+ */
+       .set    _C_LABEL(PTmap),(PDSLOT_PTE << PDSHIFT)
+       .set    _C_LABEL(PTD),(_C_LABEL(PTmap) + PDSLOT_PTE * PAGE_SIZE)
+       .set    _C_LABEL(PTDpde),(_C_LABEL(PTD) + PDSLOT_PTE * 4)
+
+/*
+ * APTmap, APTD is the alternate recursive pagemap.
+ * It's used when modifying another process's page tables.
+ *
+ * XXX 4 == sizeof pde
+ */
+       .set    _C_LABEL(APTmap),(PDSLOT_APTE << PDSHIFT)
+       .set    _C_LABEL(APTD),(_C_LABEL(APTmap) + PDSLOT_APTE * PAGE_SIZE)
+       .set    _C_LABEL(APTDpde),(_C_LABEL(PTD) + PDSLOT_APTE * 4)
+
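
The recursive slot installed by these .set lines means the PTE mapping any
virtual address is itself visible at a fixed virtual address. A hedged C
sketch of the arithmetic; the PDSLOT_PTE value below is a placeholder, and
the real constant comes from pmap.h:

    /* Sketch: where the PTE for a given va lives in the recursive window. */
    #include <stdint.h>

    #define PGSHIFT     12          /* 4 KB pages (assumed) */
    #define PDSHIFT     22          /* one PDE covers 4 MB (assumed) */
    #define PDSLOT_PTE  0x3bf       /* placeholder; real value is in pmap.h */

    static inline uint32_t
    pte_va(uint32_t va)
    {
            uint32_t ptmap = (uint32_t)PDSLOT_PTE << PDSHIFT;  /* PTmap base */

            /* One 4-byte PTE per page, indexed by virtual page number. */
            return ptmap + ((va >> PGSHIFT) * 4);
    }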
+
+/*
+ * Xen guest identifier and loader selection
+ */
+.section __xen_guest
+       .asciz "GUEST_OS=netbsd,GUEST_VER=2.0,XEN_VER=2.0,LOADER=generic"
+
+
+/*
+ * Initialization
+ */
+       .data
+
+       .globl  _C_LABEL(cpu)
+       .globl  _C_LABEL(esym),_C_LABEL(boothowto)
+       .globl  _C_LABEL(bootinfo),_C_LABEL(atdevbase)
+#ifdef COMPAT_OLDBOOT
+       .globl  _C_LABEL(bootdev)
+#endif
+       .globl  _C_LABEL(proc0paddr),_C_LABEL(PTDpaddr)
+       .globl  _C_LABEL(biosbasemem),_C_LABEL(biosextmem)
+       .globl  _C_LABEL(gdt)
+#ifdef I586_CPU
+       .globl  _C_LABEL(idt)
+#endif
+       .globl  _C_LABEL(lapic_tpr)     
+       
+#if NLAPIC > 0
+#ifdef __ELF__
+       .align  PAGE_SIZE
+#else
+       .align  12
+#endif
+       .globl _C_LABEL(local_apic), _C_LABEL(lapic_id)
+_C_LABEL(local_apic):
+       .space  LAPIC_ID
+_C_LABEL(lapic_id):    
+       .long   0x00000000
+       .space  LAPIC_TPRI-(LAPIC_ID+4)
+_C_LABEL(lapic_tpr):           
+       .space  LAPIC_PPRI-LAPIC_TPRI
+_C_LABEL(lapic_ppr):           
+       .space  LAPIC_ISR-LAPIC_PPRI
+_C_LABEL(lapic_isr):
+       .space  PAGE_SIZE-LAPIC_ISR
+#else
+_C_LABEL(lapic_tpr):   
+       .long 0
+#endif
+       
+
+_C_LABEL(cpu):         .long   0       # are we 386, 386sx, or 486,
+                                       #   or Pentium, or..
+_C_LABEL(esym):                .long   0       # ptr to end of syms
+_C_LABEL(atdevbase):   .long   0       # location of start of iomem in virtual
+_C_LABEL(proc0paddr):  .long   0
+_C_LABEL(PTDpaddr):    .long   0       # paddr of PTD, for libkvm
+#ifndef REALBASEMEM
+_C_LABEL(biosbasemem): .long   0       # base memory reported by BIOS
+#else
+_C_LABEL(biosbasemem): .long   REALBASEMEM
+#endif
+#ifndef REALEXTMEM
+_C_LABEL(biosextmem):  .long   0       # extended memory reported by BIOS
+#else
+_C_LABEL(biosextmem):  .long   REALEXTMEM
+#endif
+
+#include <machine/xen.h>
+#define __HYPERVISOR_yield                8
+
+       .space 512
+tmpstk:
+       .long tmpstk, __KERNEL_DS
+
+
+#define        _RELOC(x)       ((x))
+#define        RELOC(x)        _RELOC(_C_LABEL(x))
+
+/* XXX assym.h */
+#define MOD_START   48
+#define MOD_LEN     56
+/* XXX assym.h */
+
+       .text
+       .globl  _C_LABEL(kernel_text)
+       .set    _C_LABEL(kernel_text),KERNTEXTOFF
+
+       .globl  start
+start:
+       cld
+
+       lss     tmpstk,%esp             # bootstrap stack end location
+
+       movl    %esi,%ebx               # save start_info pointer
+
+#if (NKSYMS || defined(DDB) || defined(LKM)) && !defined(SYMTAB_SPACE)
+       /* Save the symbol locations. */
+       movl    MOD_START(%ebx),%esi
+       addl    MOD_LEN(%ebx),%esi
+       movl    %esi,RELOC(esym)
+#endif
+
+        /* Clear BSS first so that there are no surprises... */
+       xorl    %eax,%eax
+       movl    $RELOC(__bss_start),%edi
+       movl    $RELOC(_end),%ecx
+       subl    %edi,%ecx
+       rep stosb
+
+       movl    %ebx,RELOC(avail_start)
+
+       /* Copy the necessary stuff from start_info structure. */
+        /* We need to copy shared_info early, so that sti/cli work */
+       movl    %ebx,%esi
+       movl    $RELOC(start_info_union),%edi
+       movl    $128,%ecx
+       rep movsl
+
+       /* (howto, [bootdev], bootinfo, basemem, extmem). */
+       xorl    %eax,%eax
+       movl    %eax,RELOC(boothowto)
+#ifdef COMPAT_OLDBOOT
+       movl    %eax,RELOC(bootdev)
+#endif
+       movl    $0x20000,%eax
+       movl    %eax,RELOC(boothowto)
+
+       /* First, reset the PSL. */
+       pushl   $PSL_MBO
+       popfl
+
+       /* Clear segment registers; always null in proc0. */
+       xorl    %eax,%eax
+       movw    %ax,%fs
+       movw    %ax,%gs
+       decl    %eax
+       movl    %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
+
+       xorl    %eax,%eax
+       cpuid
+       movl    %eax,RELOC(cpu_info_primary)+CPU_INFO_LEVEL
+
+/*
+ * Virtual address space of kernel:
+ *
+ * text | data | bss | [syms] | page dir | proc0 kstack 
+ *                           0          1       2      3
+ */
+#define        PROC0PDIR       ((0)              * PAGE_SIZE)
+#define        PROC0STACK      ((1)              * PAGE_SIZE)
+#define        SYSMAP          ((1+UPAGES)       * PAGE_SIZE)
+#define        TABLESIZE       ((1+UPAGES) * PAGE_SIZE) /* + nkpde * PAGE_SIZE */
+
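
With the conventional i386 values (PAGE_SIZE = 4096 and UPAGES = 2, both
assumed here rather than taken from this tree), those defines work out as in
this quick standalone check:

    /* Quick check of the bootstrap layout, assuming UPAGES = 2. */
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define UPAGES    2

    int
    main(void)
    {
            printf("PROC0PDIR  = %d\n", 0 * PAGE_SIZE);             /* 0 */
            printf("PROC0STACK = %d\n", 1 * PAGE_SIZE);             /* 4096 */
            printf("SYSMAP     = %d\n", (1 + UPAGES) * PAGE_SIZE);  /* 12288 */
            /* TABLESIZE gets nkpde * PAGE_SIZE added at run time. */
            printf("TABLESIZE  = %d\n", (1 + UPAGES) * PAGE_SIZE);  /* 12288 */
            return 0;
    }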
+       /* Find end of kernel image. */
+       movl    RELOC(avail_start),%edi
+       /* Calculate where to start the bootstrap tables. */
+       movl    %edi,%esi
+
+       /*
+        * Calculate the size of the kernel page table directory, and
+        * how many entries it will have.
+        */
+       movl    RELOC(nkpde),%ecx               # get nkpde
+       cmpl    $NKPTP_MIN,%ecx                 # larger than min?
+       jge     1f
+       movl    $NKPTP_MIN,%ecx                 # set at min
+       jmp     2f
+1:     cmpl    $NKPTP_MAX,%ecx                 # larger than max?
+       jle     2f
+       movl    $NKPTP_MAX,%ecx
+2:
+
+       /* Clear memory for bootstrap tables. */
+       shll    $PGSHIFT,%ecx
+       addl    $TABLESIZE,%ecx
+       addl    %esi,%ecx                       # end of tables
+       movl    %ecx,RELOC(gdt)
+       addl    $PAGE_SIZE,%ecx
+       movl    %ecx,RELOC(avail_start)
+       subl    %edi,%ecx                       # size of tables
+       shrl    $2,%ecx
+       xorl    %eax,%eax
+       cld
+       rep
+       stosl
+
+/*
+ * fillkpt
+ *     eax = pte (page frame | control | status)
+ *     ebx = page table address
+ *     ecx = number of pages to map
+ */
+#define        fillkpt         \
+1:     movl    %eax,(%ebx)     ; \
+       addl    $PAGE_SIZE,%eax ; /* increment physical address */ \
+       addl    $4,%ebx         ; /* next pte */ \
+       loop    1b              ;
+
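
In C, fillkpt amounts to the following loop over the register convention
documented in its comment (a sketch, not generated code):

    /* Sketch of fillkpt: pte in %eax, table address in %ebx, count in %ecx. */
    #include <stdint.h>

    #define PAGE_SIZE 4096

    static void
    fillkpt_c(uint32_t pte, uint32_t *pt, uint32_t npages)
    {
            while (npages-- > 0) {
                    *pt++ = pte;        /* store entry, move to next slot */
                    pte += PAGE_SIZE;   /* next physical page, same bits */
            }
    }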
+/*
+ * Build initial page tables.
+ */
+       /* Calculate end of text segment, rounded to a page. */
+       leal    (RELOC(etext)+PGOFSET),%edx
+       andl    $~PGOFSET,%edx
+       
+       /* Skip over the first 1MB. */
+       movl    $KERNTEXTOFF,%eax
+       movl    %eax,%ecx
+       subl    $KERNBASE_LOCORE,%ecx
+       shrl    $PGSHIFT,%ecx
+       leal    (SYSMAP)(%esi,%ecx,4),%ebx
+
+       /* Map the kernel text read-only. */
+       movl    %edx,%ecx
+       subl    %eax,%ecx
+       shrl    $PGSHIFT,%ecx
+       orl     $(PG_V|PG_KR),%eax
+       fillkpt
+
+       /* Map the data, BSS, and bootstrap tables read-write. */
+       movl    RELOC(avail_start),%ecx
+                                                   # end of tables
+       subl    %edx,%ecx                               # subtract end of text
+       shrl    $PGSHIFT,%ecx
+       leal    (PG_V|PG_KW)(%edx),%eax
+       fillkpt
+
+       movl    $0xffffffff,(%ebx)
+       addl    $4,%ebx
+
+/*
+ * Construct a page table directory.
+ */
+       /* Map kernel PDEs. */
+       movl    RELOC(nkpde),%ecx                       # for this many pde s,
+       leal    (PROC0PDIR+PDSLOT_KERN*4)(%esi),%ebx    # kernel pde offset
+       leal    (SYSMAP+PG_V|PG_KW)(%esi),%eax          # pte for KPT in proc 0,
+       fillkpt
+
+       /* Install a PDE recursively mapping page directory as a page table! */
+       leal    (PROC0PDIR+PG_V/*|PG_KW*/)(%esi),%eax   # pte for ptd
+       movl    %eax,(PROC0PDIR+PDSLOT_PTE*4)(%esi)     # recursive PD slot
+
+       /* Save phys. addr of PTD, for libkvm. */
+       movl    %esi,RELOC(PTDpaddr)
+
+       call    xpmap_init
+
+       /* cr0 is 0x8005003b */
+
+       /* Relocate atdevbase. */
+       movl    _C_LABEL(avail_start),%edx
+       movl    %edx,_C_LABEL(HYPERVISOR_shared_info)
+       addl    $PAGE_SIZE,%edx                 # shared_inf
+       movl    %edx,_C_LABEL(atdevbase)
+
+       /* Set up bootstrap stack. */
+       leal    (PROC0STACK)(%esi),%eax
+       movl    %eax,_C_LABEL(proc0paddr)
+       leal    (USPACE-FRAMESIZE)(%eax),%esp
+       subl    $KERNBASE_LOCORE,%esi
+       movl    %esi,PCB_CR3(%eax)      # pcb->pcb_cr3
+       xorl    %ebp,%ebp               # mark end of frames
+
+       movl    _C_LABEL(atdevbase),%eax
+       pushl   %eax
+       call    _C_LABEL(init386)       # wire 386 chip for unix operation
+       addl    $4,%esp
+
+#ifdef SAFARI_FIFO_HACK
+       movb    $5,%al
+       movw    $0x37b,%dx
+       outb    %al,%dx
+       movw    $0x37f,%dx
+       inb     %dx,%al
+       movb    %al,%cl
+
+       orb     $1,%cl
+
+       movb    $5,%al
+       movw    $0x37b,%dx
+       outb    %al,%dx
+       movw    $0x37f,%dx
+       movb    %cl,%al
+       outb    %al,%dx
+#endif /* SAFARI_FIFO_HACK */
+
+       call    _C_LABEL(main)
+
+/*
+ * void proc_trampoline(void);
+ * This is a trampoline function pushed onto the stack of a newly created
+ * process in order to do some additional setup.  The trampoline is entered by
+ * cpu_switch()ing to the process, so we abuse the callee-saved registers used
+ * by cpu_switch() to store the information about the stub to call.
+ * NOTE: This function does not have a normal calling sequence!
+ */
+/* LINTSTUB: Func: void proc_trampoline(void) */
+NENTRY(proc_trampoline)
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(proc_trampoline_mp)
+#endif
+       movl    $IPL_NONE,CPUVAR(ILEVEL)
+       pushl   %ebx
+       call    *%esi
+       addl    $4,%esp
+       DO_DEFERRED_SWITCH(%eax)
+       INTRFASTEXIT
+       /* NOTREACHED */
+
+/*****************************************************************************/
+#ifdef COMPAT_16
+/*
+ * Signal trampoline; copied to top of user stack.
+ */
+/* LINTSTUB: Var: char sigcode[1], esigcode[1]; */
+NENTRY(sigcode)
+       /*
+        * Handler has returned here as if we called it.  The sigcontext
+        * is on the stack after the 3 args "we" pushed.
+        */
+       leal    12(%esp),%eax           # get pointer to sigcontext
+       movl    %eax,4(%esp)            # put it in the argument slot
+                                       # fake return address already there
+       movl    $SYS_compat_16___sigreturn14,%eax
+       int     $0x80                   # enter kernel with args on stack
+       movl    $SYS_exit,%eax
+       int     $0x80                   # exit if sigreturn fails
+       .globl  _C_LABEL(esigcode)
+_C_LABEL(esigcode):
+#endif
+
+/*****************************************************************************/
+
+/*
+ * The following primitives are used to fill and copy regions of memory.
+ */
+
+/*
+ * XXX No section 9 man page for fillw.
+ * fillw seems to be very sparsely used (only in pccons, it seems).
+ * One wonders if it couldn't be done without.
+ * -- Perry Metzger, May 7, 2001
+ */
+/*
+ * void fillw(short pattern, void *addr, size_t len);
+ * Write len copies of pattern at addr.
+ */
+/* LINTSTUB: Func: void fillw(short pattern, void *addr, size_t len) */
+ENTRY(fillw)
+       pushl   %edi
+       movl    8(%esp),%eax
+       movl    12(%esp),%edi
+       movw    %ax,%cx
+       rorl    $16,%eax
+       movw    %cx,%ax
+       cld
+       movl    16(%esp),%ecx
+       shrl    %ecx                    # do longwords
+       rep
+       stosl
+       movl    16(%esp),%ecx
+       andl    $1,%ecx                 # do remainder
+       rep
+       stosw
+       popl    %edi
+       ret
+
+/*
+ * int kcopy(const void *from, void *to, size_t len);
+ * Copy len bytes, abort on fault.
+ */
+/* LINTSTUB: Func: int kcopy(const void *from, void *to, size_t len) */
+ENTRY(kcopy)
+       pushl   %esi
+       pushl   %edi
+       GET_CURPCB(%eax)                # load curpcb into eax and set on-fault
+       pushl   PCB_ONFAULT(%eax)
+       movl    $_C_LABEL(kcopy_fault), PCB_ONFAULT(%eax)
+
+       movl    16(%esp),%esi
+       movl    20(%esp),%edi
+       movl    24(%esp),%ecx
+       movl    %edi,%eax
+       subl    %esi,%eax
+       cmpl    %ecx,%eax               # overlapping?
+       jb      1f
+       cld                             # nope, copy forward
+       shrl    $2,%ecx                 # copy by 32-bit words
+       rep
+       movsl
+       movl    24(%esp),%ecx
+       andl    $3,%ecx                 # any bytes left?
+       rep
+       movsb
+
+       GET_CURPCB(%edx)                # XXX save curpcb?
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       xorl    %eax,%eax
+       ret
+
+       ALIGN_TEXT
+1:     addl    %ecx,%edi               # copy backward
+       addl    %ecx,%esi
+       std
+       andl    $3,%ecx                 # any fractional bytes?
+       decl    %edi
+       decl    %esi
+       rep
+       movsb
+       movl    24(%esp),%ecx           # copy remainder by 32-bit words
+       shrl    $2,%ecx
+       subl    $3,%esi
+       subl    $3,%edi
+       rep
+       movsl
+       cld
+
+       GET_CURPCB(%edx)
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       xorl    %eax,%eax
+       ret
+
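
The single unsigned compare at the top of kcopy ("cmpl %ecx,%eax" on
dst - src) decides the copy direction: because the subtraction wraps, any
dst at or below src also takes the forward path. A hedged C rendering of
that decision:

    /* Sketch of kcopy's direction test, in C. */
    #include <stdint.h>
    #include <string.h>

    static void
    kcopy_direction(const void *src, void *dst, size_t len)
    {
            /* Unsigned subtraction wraps, so dst <= src also lands in the
             * forward branch; only dst inside (src, src + len) goes backward. */
            if ((uintptr_t)dst - (uintptr_t)src >= len)
                    memcpy(dst, src, len);      /* forward copy is safe */
            else
                    memmove(dst, src, len);     /* overlapping: copy backward */
    }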
+/*****************************************************************************/
+
+/*
+ * The following primitives are used to copy data in and out of the user's
+ * address space.
+ */
+
+/*
+ * Default to the lowest common denominator.  We will improve it
+ * later.
+ */
+#if defined(I386_CPU)
+#define        DEFAULT_COPYOUT         _C_LABEL(i386_copyout)
+#define        DEFAULT_COPYIN          _C_LABEL(i386_copyin)
+#elif defined(I486_CPU)
+#define        DEFAULT_COPYOUT         _C_LABEL(i486_copyout)
+#define        DEFAULT_COPYIN          _C_LABEL(i386_copyin)
+#elif defined(I586_CPU)
+#define        DEFAULT_COPYOUT         _C_LABEL(i486_copyout)  /* XXX */
+#define        DEFAULT_COPYIN          _C_LABEL(i386_copyin)   /* XXX */
+#elif defined(I686_CPU)
+#define        DEFAULT_COPYOUT         _C_LABEL(i486_copyout)  /* XXX */
+#define        DEFAULT_COPYIN          _C_LABEL(i386_copyin)   /* XXX */
+#endif
+
+       .data
+
+       .globl  _C_LABEL(copyout_func)
+_C_LABEL(copyout_func):
+       .long   DEFAULT_COPYOUT
+
+       .globl  _C_LABEL(copyin_func)
+_C_LABEL(copyin_func):
+       .long   DEFAULT_COPYIN
+
+       .text
+
+/*
+ * int copyout(const void *from, void *to, size_t len);
+ * Copy len bytes into the user's address space.
+ * see copyout(9)
+ */
+/* LINTSTUB: Func: int copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(copyout)
+       DO_DEFERRED_SWITCH(%eax)
+       jmp     *_C_LABEL(copyout_func)
+
+#if defined(I386_CPU)
+/* LINTSTUB: Func: int i386_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i386_copyout)
+       pushl   %esi
+       pushl   %edi
+       pushl   $0
+       
+       movl    16(%esp),%esi
+       movl    20(%esp),%edi
+       movl    24(%esp),%eax
+
+       /*
+        * We check that the end of the destination buffer is not past the end
+        * of the user's address space.  If it's not, then we only need to
+        * check that each page is writable.  The 486 will do this for us; the
+        * 386 will not.  (We assume that pages in user space that are not
+        * writable by the user are not writable by the kernel either.)
+        */
+       movl    %edi,%edx
+       addl    %eax,%edx
+       jc      _C_LABEL(copy_efault)
+       cmpl    $VM_MAXUSER_ADDRESS,%edx
+       ja      _C_LABEL(copy_efault)
+
+       testl   %eax,%eax               # anything to do?
+       jz      3f
+
+       /*
+        * We have to check each PTE for (write) permission, since the CPU
+        * doesn't do it for us.
+        */
+
+       /* Compute number of pages. */
+       movl    %edi,%ecx
+       andl    $PGOFSET,%ecx
+       addl    %eax,%ecx
+       decl    %ecx
+       shrl    $PGSHIFT,%ecx
+
+       /* Compute PTE offset for start address. */
+       shrl    $PGSHIFT,%edi
+
+       GET_CURPCB(%edx)
+       movl    $2f,PCB_ONFAULT(%edx)
+
+1:     /* Check PTE for each page. */
+       testb   $PG_RW,_C_LABEL(PTmap)(,%edi,4)
+       jz      2f
+       
+4:     incl    %edi
+       decl    %ecx
+       jns     1b
+
+       movl    20(%esp),%edi
+       movl    24(%esp),%eax
+       jmp     3f
+       
+2:     /* Simulate a trap. */
+       pushl   %ecx
+       movl    %edi,%eax
+       shll    $PGSHIFT,%eax
+       pushl   %eax
+       call    _C_LABEL(trapwrite)     # trapwrite(addr)
+       addl    $4,%esp                 # pop argument
+       popl    %ecx
+       testl   %eax,%eax               # if not ok, return EFAULT
+       jz      4b
+       jmp     _C_LABEL(copy_efault)
+
+3:     GET_CURPCB(%edx)
+       movl    $_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
+
+       /* bcopy(%esi, %edi, %eax); */
+       cld
+       movl    %eax,%ecx
+       shrl    $2,%ecx
+       rep
+       movsl
+       movl    %eax,%ecx
+       andl    $3,%ecx
+       rep
+       movsb
+
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       xorl    %eax,%eax
+       ret
+#endif /* I386_CPU */
+
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+/* LINTSTUB: Func: int i486_copyout(const void *kaddr, void *uaddr, size_t len) */
+ENTRY(i486_copyout)
+       pushl   %esi
+       pushl   %edi
+       pushl   $0
+       
+       movl    16(%esp),%esi
+       movl    20(%esp),%edi
+       movl    24(%esp),%eax
+
+       /*
+        * We check that the end of the destination buffer is not past the end
+        * of the user's address space.
+        */
+       movl    %edi,%edx
+       addl    %eax,%edx
+       jc      _C_LABEL(copy_efault)
+       cmpl    $VM_MAXUSER_ADDRESS,%edx
+       ja      _C_LABEL(copy_efault)
+
+       GET_CURPCB(%edx)
+       movl    $_C_LABEL(copy_fault),PCB_ONFAULT(%edx)
+
+       /* bcopy(%esi, %edi, %eax); */
+       cld
+       movl    %eax,%ecx
+       shrl    $2,%ecx
+       rep
+       movsl
+       movl    %eax,%ecx
+       andl    $3,%ecx
+       rep
+       movsb
+
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       xorl    %eax,%eax
+       ret
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+/*
+ * int copyin(const void *from, void *to, size_t len);
+ * Copy len bytes from the user's address space.
+ * see copyin(9)
+ */
+/* LINTSTUB: Func: int copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(copyin)
+       DO_DEFERRED_SWITCH(%eax)
+       jmp     *_C_LABEL(copyin_func)
+
+#if defined(I386_CPU) || defined(I486_CPU) || defined(I586_CPU) || \
+    defined(I686_CPU)
+/* LINTSTUB: Func: int i386_copyin(const void *uaddr, void *kaddr, size_t len) */
+ENTRY(i386_copyin)
+       pushl   %esi
+       pushl   %edi
+       GET_CURPCB(%eax)
+       pushl   $0
+       movl    $_C_LABEL(copy_fault),PCB_ONFAULT(%eax)
+       
+       movl    16(%esp),%esi
+       movl    20(%esp),%edi
+       movl    24(%esp),%eax
+
+       /*
+        * We check that the end of the destination buffer is not past the end
+        * of the user's address space.  If it's not, then we only need to
+        * check that each page is readable, and the CPU will do that for us.
+        */
+       movl    %esi,%edx
+       addl    %eax,%edx
+       jc      _C_LABEL(copy_efault)
+       cmpl    $VM_MAXUSER_ADDRESS,%edx
+       ja      _C_LABEL(copy_efault)
+
+       /* bcopy(%esi, %edi, %eax); */
+       cld
+       movl    %eax,%ecx
+       shrl    $2,%ecx
+       rep
+       movsl
+       movl    %eax,%ecx
+       andl    $3,%ecx
+       rep
+       movsb
+
+       GET_CURPCB(%edx)
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       xorl    %eax,%eax
+       ret
+#endif /* I386_CPU || I486_CPU || I586_CPU || I686_CPU */
+
+/* LINTSTUB: Ignore */
+NENTRY(copy_efault)
+       movl    $EFAULT,%eax
+
+/*
+ * kcopy_fault is used by kcopy and copy_fault is used by copyin/out.
+ *
+ * They're distinguished for lazy pmap switching.  See trap().
+ */
+/* LINTSTUB: Ignore */
+NENTRY(kcopy_fault)
+       GET_CURPCB(%edx)
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       ret
+
+/* LINTSTUB: Ignore */
+NENTRY(copy_fault)
+       GET_CURPCB(%edx)
+       popl    PCB_ONFAULT(%edx)
+       popl    %edi
+       popl    %esi
+       ret
+
+/*
+ * int copyoutstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long, into the
+ * user's address space.  Return the number of characters copied (including the
+ * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
+ * return 0 or EFAULT.
+ * see copyoutstr(9)
+ */
+/* LINTSTUB: Func: int copyoutstr(const void *kaddr, void *uaddr, size_t len, size_t *done) */
+ENTRY(copyoutstr)
+       pushl   %esi
+       pushl   %edi
+
+       DO_DEFERRED_SWITCH(%eax)
+
+       movl    12(%esp),%esi           # esi = from
+       movl    16(%esp),%edi           # edi = to
+       movl    20(%esp),%edx           # edx = maxlen
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+       cmpl    $CPUCLASS_386,_C_LABEL(cpu_class)
+       jne     5f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+       /* Compute number of bytes in first page. */
+       movl    %edi,%eax
+       andl    $PGOFSET,%eax
+       movl    $PAGE_SIZE,%ecx
+       subl    %eax,%ecx               # ecx = PAGE_SIZE - (src % PAGE_SIZE)
+
+       GET_CURPCB(%eax)
+       movl    $6f,PCB_ONFAULT(%eax)
+
+1:     /*
+        * Once per page, check that we are still within the bounds of user
+        * space, and check for a write fault.
+        */
+       cmpl    $VM_MAXUSER_ADDRESS,%edi
+       jae     _C_LABEL(copystr_efault)
+
+       /* Compute PTE offset. */
+       movl    %edi,%eax
+       shrl    $PGSHIFT,%eax           # calculate pte address
+
+       testb   $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+       jnz     2f
+
+6:     /* Simulate a trap. */
+       pushl   %edx
+       pushl   %edi
+       call    _C_LABEL(trapwrite)     # trapwrite(addr)
+       addl    $4,%esp                 # clear argument from stack
+       popl    %edx
+       testl   %eax,%eax
+       jnz     _C_LABEL(copystr_efault)
+
+2:     /* Copy up to end of this page. */
+       subl    %ecx,%edx               # predecrement total count
+       jnc     3f
+       addl    %edx,%ecx               # ecx += (edx - ecx) = edx
+       xorl    %edx,%edx
+
+3:     decl    %ecx
+       js      4f
+       lodsb
+       stosb
+       testb   %al,%al
+       jnz     3b
+
+       /* Success -- 0 byte reached. */
+       addl    %ecx,%edx               # add back residual for this page
+       xorl    %eax,%eax
+       jmp     copystr_return
+
+4:     /* Go to next page, if any. */
+       movl    $PAGE_SIZE,%ecx
+       testl   %edx,%edx
+       jnz     1b
+
+       /* edx is zero -- return ENAMETOOLONG. */
+       movl    $ENAMETOOLONG,%eax
+       jmp     copystr_return
+#endif /* I386_CPU */
+
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+5:     GET_CURPCB(%eax)
+       movl    $_C_LABEL(copystr_fault),PCB_ONFAULT(%eax)
+       /*
+        * Get min(%edx, VM_MAXUSER_ADDRESS-%edi).
+        */
+       movl    $VM_MAXUSER_ADDRESS,%eax
+       subl    %edi,%eax
+       cmpl    %edx,%eax
+       jae     1f
+       movl    %eax,%edx
+       movl    %eax,20(%esp)
+
+1:     incl    %edx
+       cld
+
+1:     decl    %edx
+       jz      2f
+       lodsb
+       stosb
+       testb   %al,%al
+       jnz     1b
+
+       /* Success -- 0 byte reached. */
+       decl    %edx
+       xorl    %eax,%eax
+       jmp     copystr_return
+
+2:     /* edx is zero -- return EFAULT or ENAMETOOLONG. */
+       cmpl    $VM_MAXUSER_ADDRESS,%edi
+       jae     _C_LABEL(copystr_efault)
+       movl    $ENAMETOOLONG,%eax
+       jmp     copystr_return
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+/*
+ * int copyinstr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long, from the
+ * user's address space.  Return the number of characters copied (including the
+ * NUL) in *lencopied.  If the string is too long, return ENAMETOOLONG; else
+ * return 0 or EFAULT.
+ * see copyinstr(9)
+ */
+/* LINTSTUB: Func: int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) */
+ENTRY(copyinstr)
+       pushl   %esi
+       pushl   %edi
+
+       DO_DEFERRED_SWITCH(%eax)
+
+       GET_CURPCB(%ecx)
+       movl    $_C_LABEL(copystr_fault),PCB_ONFAULT(%ecx)
+
+       movl    12(%esp),%esi           # %esi = from
+       movl    16(%esp),%edi           # %edi = to
+       movl    20(%esp),%edx           # %edx = maxlen
+
+       /*
+        * Get min(%edx, VM_MAXUSER_ADDRESS-%esi).
+        */
+       movl    $VM_MAXUSER_ADDRESS,%eax
+       subl    %esi,%eax
+       cmpl    %edx,%eax
+       jae     1f
+       movl    %eax,%edx
+       movl    %eax,20(%esp)
+
+1:     incl    %edx
+       cld
+
+1:     decl    %edx
+       jz      2f
+       lodsb
+       stosb
+       testb   %al,%al
+       jnz     1b
+
+       /* Success -- 0 byte reached. */
+       decl    %edx
+       xorl    %eax,%eax
+       jmp     copystr_return
+
+2:     /* edx is zero -- return EFAULT or ENAMETOOLONG. */
+       cmpl    $VM_MAXUSER_ADDRESS,%esi
+       jae     _C_LABEL(copystr_efault)
+       movl    $ENAMETOOLONG,%eax
+       jmp     copystr_return
+
+/* LINTSTUB: Ignore */
+NENTRY(copystr_efault)
+       movl    $EFAULT,%eax
+
+/* LINTSTUB: Ignore */
+NENTRY(copystr_fault)
+copystr_return:
+       /* Set *lencopied and return %eax. */
+       GET_CURPCB(%ecx)
+       movl    $0,PCB_ONFAULT(%ecx)
+       movl    20(%esp),%ecx
+       subl    %edx,%ecx
+       movl    24(%esp),%edx
+       testl   %edx,%edx
+       jz      8f
+       movl    %ecx,(%edx)
+
+8:     popl    %edi
+       popl    %esi
+       ret
+
+/*
+ * int copystr(const void *from, void *to, size_t maxlen, size_t *lencopied);
+ * Copy a NUL-terminated string, at most maxlen characters long.  Return the
+ * number of characters copied (including the NUL) in *lencopied.  If the
+ * string is too long, return ENAMETOOLONG; else return 0.
+ * see copystr(9)
+ */
+/* LINTSTUB: Func: int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done) */
+ENTRY(copystr)
+       pushl   %esi
+       pushl   %edi
+
+       movl    12(%esp),%esi           # esi = from
+       movl    16(%esp),%edi           # edi = to
+       movl    20(%esp),%edx           # edx = maxlen
+       incl    %edx
+       cld
+
+1:     decl    %edx
+       jz      4f
+       lodsb
+       stosb
+       testb   %al,%al
+       jnz     1b
+
+       /* Success -- 0 byte reached. */
+       decl    %edx
+       xorl    %eax,%eax
+       jmp     6f
+
+4:     /* edx is zero -- return ENAMETOOLONG. */
+       movl    $ENAMETOOLONG,%eax
+
+6:     /* Set *lencopied and return %eax. */
+       movl    20(%esp),%ecx
+       subl    %edx,%ecx
+       movl    24(%esp),%edx
+       testl   %edx,%edx
+       jz      7f
+       movl    %ecx,(%edx)
+
+7:     popl    %edi
+       popl    %esi
+       ret
+
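
A C rendering of the copystr(9) contract may help when reading the three
assembly variants above; this sketch mirrors the residual-count bookkeeping
(lencopied = maxlen - %edx) rather than the kernel's actual implementation:

    /* Sketch of the copystr(9) contract described above. */
    #include <errno.h>
    #include <stddef.h>

    static int
    copystr_c(const char *from, char *to, size_t maxlen, size_t *lencopied)
    {
            size_t i;

            for (i = 0; i < maxlen; i++) {
                    if ((to[i] = from[i]) == '\0') {
                            if (lencopied != NULL)
                                    *lencopied = i + 1;     /* includes the NUL */
                            return 0;
                    }
            }
            if (lencopied != NULL)
                    *lencopied = maxlen;        /* copied maxlen bytes, no NUL */
            return ENAMETOOLONG;
    }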
+/*
+ * long fuword(const void *uaddr);
+ * Fetch an int from the user's address space.
+ * see fuword(9)
+ */
+/* LINTSTUB: Func: long fuword(const void *base) */
+ENTRY(fuword)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-4,%edx
+       ja      _C_LABEL(fusuaddrfault)
+       GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+       movl    (%edx),%eax
+       movl    $0,PCB_ONFAULT(%ecx)
+       ret
+       
+/*
+ * int fusword(const void *uaddr);
+ * Fetch a short from the user's address space.
+ * see fusword(9)
+ */
+/* LINTSTUB: Func: int fusword(const void *base) */
+ENTRY(fusword)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-2,%edx
+       ja      _C_LABEL(fusuaddrfault)
+       GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+       movzwl  (%edx),%eax
+       movl    $0,PCB_ONFAULT(%ecx)
+       ret
+       
+/*
+ * int fuswintr(const void *uaddr);
+ * Fetch a short from the user's address space.  Can be called during an
+ * interrupt.
+ * see fuswintr(9)
+ */
+/* LINTSTUB: Func: int fuswintr(const void *base) */
+ENTRY(fuswintr)
+       cmpl    $TLBSTATE_VALID, CPUVAR(TLBSTATE)
+       jnz     _C_LABEL(fusuaddrfault)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-2,%edx
+       ja      _C_LABEL(fusuaddrfault)
+       movl    CPUVAR(CURLWP),%ecx
+       movl    L_ADDR(%ecx),%ecx
+       movl    $_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
+       movzwl  (%edx),%eax
+       movl    $0,PCB_ONFAULT(%ecx)
+       ret
+       
+/*
+ * int fubyte(const void *uaddr);
+ * Fetch a byte from the user's address space.
+ * see fubyte(9)
+ */
+/* LINTSTUB: Func: int fubyte(const void *base) */
+ENTRY(fubyte)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-1,%edx
+       ja      _C_LABEL(fusuaddrfault)
+       GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+       movzbl  (%edx),%eax
+       movl    $0,PCB_ONFAULT(%ecx)
+       ret
+
+/*
+ * Handle faults from [fs]u*().  Clean up and return -1.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusufault)
+       movl    $0,PCB_ONFAULT(%ecx)
+       movl    $-1,%eax
+       ret
+
+/*
+ * Handle faults from [fs]u*().  Clean up and return -1.  This differs from
+ * fusufault() in that trap() will recognize it and return immediately rather
+ * than trying to page fault.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusubail)
+       movl    $0,PCB_ONFAULT(%ecx)
+       movl    $-1,%eax
+       ret
+
+/*
+ * Handle earlier faults from [fs]u*() due to out-of-range addresses.
+ */
+/* LINTSTUB: Ignore */
+NENTRY(fusuaddrfault)
+       movl    $-1,%eax
+       ret
+
+/*
+ * int suword(void *uaddr, long x);
+ * Store an int in the user's address space.
+ * see suword(9)
+ */
+/* LINTSTUB: Func: int suword(void *base, long c) */
+ENTRY(suword)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-4,%edx
+       ja      _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+       cmpl    $CPUCLASS_386,_C_LABEL(cpu_class)
+       jne     2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+       GET_CURPCB(%eax)
+       movl    $3f,PCB_ONFAULT(%eax)
+
+       movl    %edx,%eax
+       shrl    $PGSHIFT,%eax           # calculate pte address
+       testb   $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+       jnz     1f
+
+3:     /* Simulate a trap. */
+       pushl   %edx
+       pushl   %edx
+       call    _C_LABEL(trapwrite)     # trapwrite(addr)
+       addl    $4,%esp                 # clear parameter from the stack
+       popl    %edx
+       GET_CURPCB(%ecx)
+       testl   %eax,%eax
+       jnz     _C_LABEL(fusufault)
+
+1:     /* XXX also need to check the following 3 bytes for validity! */
+#endif
+
+2:     GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+       movl    8(%esp),%eax
+       movl    %eax,(%edx)
+       xorl    %eax,%eax
+       movl    %eax,PCB_ONFAULT(%ecx)
+       ret
+       
+/*
+ * int susword(void *uaddr, short x);
+ * Store a short in the user's address space.
+ * see susword(9)
+ */
+/* LINTSTUB: Func: int susword(void *base, short c) */
+ENTRY(susword)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-2,%edx
+       ja      _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+       cmpl    $CPUCLASS_386,_C_LABEL(cpu_class)
+       jne     2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+       GET_CURPCB(%eax)
+       movl    $3f,PCB_ONFAULT(%eax)
+
+       movl    %edx,%eax
+       shrl    $PGSHIFT,%eax           # calculate pte address
+       testb   $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+       jnz     1f
+
+3:     /* Simulate a trap. */
+       pushl   %edx
+       pushl   %edx
+       call    _C_LABEL(trapwrite)     # trapwrite(addr)
+       addl    $4,%esp                 # clear parameter from the stack
+       popl    %edx
+       GET_CURPCB(%ecx)
+       testl   %eax,%eax
+       jnz     _C_LABEL(fusufault)
+
+1:     /* XXX also need to check the following byte for validity! */
+#endif
+
+2:     GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+       movl    8(%esp),%eax
+       movw    %ax,(%edx)
+       xorl    %eax,%eax
+       movl    %eax,PCB_ONFAULT(%ecx)
+       ret
+
+/*
+ * int suswintr(void *uaddr, short x);
+ * Store a short in the user's address space.  Can be called during an
+ * interrupt.
+ * see suswintr(9)
+ */
+/* LINTSTUB: Func: int suswintr(void *base, short c) */
+ENTRY(suswintr)
+       cmpl    $TLBSTATE_VALID, CPUVAR(TLBSTATE)
+       jnz     _C_LABEL(fusuaddrfault)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-2,%edx
+       ja      _C_LABEL(fusuaddrfault)
+       movl    CPUVAR(CURLWP),%ecx
+       movl    L_ADDR(%ecx),%ecx
+       movl    $_C_LABEL(fusubail),PCB_ONFAULT(%ecx)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+       cmpl    $CPUCLASS_386,_C_LABEL(cpu_class)
+       jne     2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+       movl    %edx,%eax
+       shrl    $PGSHIFT,%eax           # calculate pte address
+       testb   $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+       jnz     1f
+
+       /* Simulate a trap. */
+       jmp     _C_LABEL(fusubail)
+
+1:     /* XXX also need to check the following byte for validity! */
+#endif
+
+2:     movl    8(%esp),%eax
+       movw    %ax,(%edx)
+       xorl    %eax,%eax
+       movl    %eax,PCB_ONFAULT(%ecx)
+       ret
+
+/*
+ * int subyte(void *uaddr, char x);
+ * Store a byte in the user's address space.
+ * see subyte(9)
+ */
+/* LINTSTUB: Func: int subyte(void *base, int c) */
+ENTRY(subyte)
+       DO_DEFERRED_SWITCH(%eax)
+       movl    4(%esp),%edx
+       cmpl    $VM_MAXUSER_ADDRESS-1,%edx
+       ja      _C_LABEL(fusuaddrfault)
+
+#if defined(I386_CPU)
+#if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU)
+       cmpl    $CPUCLASS_386,_C_LABEL(cpu_class)
+       jne     2f
+#endif /* I486_CPU || I586_CPU || I686_CPU */
+
+       GET_CURPCB(%eax)        
+       movl    $3f,PCB_ONFAULT(%eax)
+
+       movl    %edx,%eax
+       shrl    $PGSHIFT,%eax           # calculate pte address
+       testb   $PG_RW,_C_LABEL(PTmap)(,%eax,4)
+       jnz     1f
+
+3:     /* Simulate a trap. */
+       pushl   %edx
+       pushl   %edx
+       call    _C_LABEL(trapwrite)     # trapwrite(addr)
+       addl    $4,%esp                 # clear parameter from the stack
+       popl    %edx
+       GET_CURPCB(%ecx)
+       testl   %eax,%eax
+       jnz     _C_LABEL(fusufault)
+
+1:
+#endif
+
+2:     GET_CURPCB(%ecx)
+       movl    $_C_LABEL(fusufault),PCB_ONFAULT(%ecx)
+
+       movb    8(%esp),%al
+       movb    %al,(%edx)
+       xorl    %eax,%eax
+       movl    %eax,PCB_ONFAULT(%ecx)
+       ret
+
+/*****************************************************************************/
+
+/*
+ * The following is i386-specific nonsense.
+ */
+
+/*
+ * void lgdt_finish(void);
+ * Finish loading a new GDT pointer (do any necessary cleanup).
+ * XXX It's somewhat questionable whether reloading all the segment registers
+ * is necessary, since the actual descriptor data is not changed except by
+ * process creation and exit, both of which clean up via task switches.  OTOH,
+ * this only happens at run time when the GDT is resized.
+ */
+/* LINTSTUB: Func: void lgdt_finish(void) */
+NENTRY(lgdt_finish)
+       movl    $GSEL(GDATA_SEL, SEL_KPL),%eax
+       movw    %ax,%ds
+       movw    %ax,%es
+       movw    %ax,%gs
+       movw    %ax,%ss
+       movl    $GSEL(GCPU_SEL, SEL_KPL),%eax
+       movw    %ax,%fs
+       /* Reload code selector by doing intersegment return. */
+       popl    %eax
+       pushl   $GSEL(GCODE_SEL, SEL_KPL)
+       pushl   %eax
+       lret
+
+/*****************************************************************************/
+
+/*
+ * These functions are primarily used by DDB.
+ */
+
+/* LINTSTUB: Func: int setjmp (label_t *l) */
+ENTRY(setjmp)
+       movl    4(%esp),%eax
+       movl    %ebx,(%eax)             # save ebx
+       movl    %esp,4(%eax)            # save esp
+       movl    %ebp,8(%eax)            # save ebp
+       movl    %esi,12(%eax)           # save esi
+       movl    %edi,16(%eax)           # save edi
+       movl    (%esp),%edx             # get rta
+       movl    %edx,20(%eax)           # save eip
+       xorl    %eax,%eax               # return (0);
+       ret
+
+/* LINTSTUB: Func: void longjmp (label_t *l) */
+ENTRY(longjmp)
+       movl    4(%esp),%eax
+       movl    (%eax),%ebx             # restore ebx
+       movl    4(%eax),%esp            # restore esp
+       movl    8(%eax),%ebp            # restore ebp
+       movl    12(%eax),%esi           # restore esi
+       movl    16(%eax),%edi           # restore edi
+       movl    20(%eax),%edx           # get rta
+       movl    %edx,(%esp)             # put in return frame
+       xorl    %eax,%eax               # return (1);
+       incl    %eax
+       ret
+
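
These behave like their userland namesakes, with the kernel's longjmp()
always returning 1; the control flow, in a standalone standard-C sketch:

    /* Standalone sketch of the setjmp/longjmp control flow. */
    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf jb;

    static void
    fail(void)
    {
            longjmp(jb, 1);                 /* unwinds back into setjmp() */
    }

    int
    main(void)
    {
            if (setjmp(jb) == 0)            /* 0 on the initial call */
                    fail();
            else
                    printf("recovered via longjmp\n");  /* nonzero afterwards */
            return 0;
    }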
+/*****************************************************************************/
+
+       .globl  _C_LABEL(sched_whichqs),_C_LABEL(sched_qs)
+       .globl  _C_LABEL(uvmexp),_C_LABEL(panic)
+
+#ifdef DIAGNOSTIC
+NENTRY(switch_error)
+       pushl   $1f
+3:     call    _C_LABEL(panic)
+       /* NOTREACHED */
+1:     .asciz  "cpu_switch"
+#endif /* DIAGNOSTIC */
+
+/*
+ * void cpu_switch(struct lwp *)
+ * Find a runnable process and switch to it.  Wait if necessary.  If the new
+ * process is the same as the old one, we short-circuit the context save and
+ * restore.
+ *     
+ * Note that the stack frame layout is known to "struct switchframe"
+ * in <machine/frame.h> and to the code in cpu_fork() which initializes 
+ * it for a new lwp.
+ */
+ENTRY(cpu_switch)
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+
+#ifdef DEBUG
+       cmpl    $IPL_SCHED,CPUVAR(ILEVEL)
+       jae     1f
+       pushl   $2f
+       call    _C_LABEL(panic)
+       /* NOTREACHED */
+2:     .asciz  "not splsched() in cpu_switch!"
+1:     
+#endif /* DEBUG */
+
+       movl    16(%esp),%esi           # current
+
+       /*
+        * Clear curlwp so that we don't accumulate system time while idle.
+        * This also ensures that schedcpu() will move the old lwp to
+        * the correct queue if it happens to get called from the spllower()
+        * below and changes the priority.  (See corresponding comment in
+        * userret()).
+        */
+       movl    $0,CPUVAR(CURLWP)
+       /*
+        * First phase: find new lwp.
+        *
+        * Registers:
+        *   %eax - queue head, scratch, then zero
+        *   %ebx - queue number
+        *   %ecx - cached value of whichqs
+        *   %edx - next lwp in queue
+        *   %esi - old lwp
+        *   %edi - new lwp
+        */
+
+       /* Look for new lwp. */
+       CLI(%ecx)                       # splhigh doesn't do a cli
+       movl    _C_LABEL(sched_whichqs),%ecx
+       bsfl    %ecx,%ebx               # find a full q
+       jnz     switch_dequeue
+
+       /*
+        * idling:      save old context.
+        *
+        * Registers:
+        *   %eax, %ecx - scratch
+        *   %esi - old lwp, then old pcb
+        *   %edi - idle pcb
+        */
+
+       pushl   %esi
+       call    _C_LABEL(pmap_deactivate2)      # pmap_deactivate(oldproc)
+       addl    $4,%esp
+
+       movl    L_ADDR(%esi),%esi
+
+       /* Save stack pointers. */
+       movl    %esp,PCB_ESP(%esi)
+       movl    %ebp,PCB_EBP(%esi)
+
+       /* Find idle PCB for this CPU */
+#ifndef MULTIPROCESSOR
+       movl    $_C_LABEL(lwp0),%ebx
+       movl    L_ADDR(%ebx),%edi
+       movl    L_MD_TSS_SEL(%ebx),%edx
+#else
+       movl    CPUVAR(IDLE_PCB),%edi
+       movl    CPUVAR(IDLE_TSS_SEL),%edx
+#endif
+       movl    $0,CPUVAR(CURLWP)               /* In case we fault... */
+
+       /* Restore the idle context (avoid interrupts) */
+       CLI(%ecx)
+
+       /* Restore stack pointers. */
+       movl    PCB_ESP(%edi),%esp
+       movl    PCB_EBP(%edi),%ebp
+
+       pushl   %edi
+       call    _C_LABEL(i386_switch_context)
+       addl    $4,%esp
+
+       /* Record new pcb. */
+       SET_CURPCB(%edi)
+
+       xorl    %esi,%esi
+       STI(%eax)
+idle_unlock:   
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)      
+       call    _C_LABEL(sched_unlock_idle)
+#endif
+       /* Interrupts are okay again. */
+       pushl   $IPL_NONE               # spl0()
+       call    _C_LABEL(Xspllower)     # process pending interrupts
+       addl    $4,%esp
+       jmp     idle_start
+idle_zero:             
+       STIC(%eax)
+       jz      4f
+       call    _C_LABEL(stipending)
+       testl   %eax,%eax
+       jz      4f
+       pushl   $IPL_NONE
+       call    _C_LABEL(Xspllower)
+       addl    $4,%esp
+4:
+       call    _C_LABEL(uvm_pageidlezero)
+       CLI(%eax)
+       cmpl    $0,_C_LABEL(sched_whichqs)
+       jnz     idle_exit
+idle_loop:
+       /* Try to zero some pages. */
+       movl    _C_LABEL(uvm)+UVM_PAGE_IDLE_ZERO,%ecx
+       testl   %ecx,%ecx
+       jnz     idle_zero
+       STIC(%eax)
+       jz      4f
+       call    _C_LABEL(stipending)
+       testl   %eax,%eax
+       jz      4f
+       pushl   $IPL_NONE
+       call    _C_LABEL(Xspllower)
+       addl    $4,%esp
+       jmp     idle_start
+4:
+       movl    $__HYPERVISOR_yield,%eax
+       TRAP_INSTR
+NENTRY(mpidle)
+idle_start:    
+       CLI(%eax)
+       cmpl    $0,_C_LABEL(sched_whichqs)
+       jz      idle_loop
+idle_exit:     
+       movl    $IPL_HIGH,CPUVAR(ILEVEL)                # splhigh
+       STI(%eax)
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)      
+       call    _C_LABEL(sched_lock_idle)
+#endif
+       movl    _C_LABEL(sched_whichqs),%ecx
+       bsfl    %ecx,%ebx
+       jz      idle_unlock
+
+#ifdef XENDEBUG_LOW
+       pushl   %ecx
+       call    _C_LABEL(xen_dbg1)
+       xorl    %ecx,%ecx
+       movl    %ecx,_C_LABEL(xen_once)
+       popl    %ecx
+#endif
+switch_dequeue:                
+       /* 
+        * we're running at splhigh(), but it's otherwise okay to take
+        * interrupts here. 
+        */
+       STI(%edi)
+       leal    _C_LABEL(sched_qs)(,%ebx,8),%eax # select q
+
+       movl    L_FORW(%eax),%edi       # unlink from front of process q
+#ifdef DIAGNOSTIC
+       cmpl    %edi,%eax               # linked to self (i.e. nothing queued)?
+       je      _C_LABEL(switch_error)  # not possible
+#endif /* DIAGNOSTIC */
+       movl    L_FORW(%edi),%edx
+       movl    %edx,L_FORW(%eax)
+       movl    %eax,L_BACK(%edx)
+
+       cmpl    %edx,%eax               # q empty?
+       jne     3f
+
+       btrl    %ebx,%ecx               # yes, clear to indicate empty
+       movl    %ecx,_C_LABEL(sched_whichqs) # update q status
+
+3:     /* We just did it. */
+       xorl    %eax,%eax
+       CLEAR_RESCHED(%eax)
+
+switch_resume:
+#ifdef DIAGNOSTIC
+       cmpl    %eax,L_WCHAN(%edi)      # Waiting for something?
+       jne     _C_LABEL(switch_error)  # Yes; shouldn't be queued.
+       cmpb    $LSRUN,L_STAT(%edi)     # In run state?
+       jne     _C_LABEL(switch_error)  # No; shouldn't be queued.
+#endif /* DIAGNOSTIC */
+
+       /* Isolate lwp.  XXX Is this necessary? */
+       movl    %eax,L_BACK(%edi)
+
+       /* Record new lwp. */
+       movb    $LSONPROC,L_STAT(%edi)  # l->l_stat = LSONPROC
+       SET_CURLWP(%edi,%ecx)
+
+       /* Skip context switch if same lwp. */
+       xorl    %ebx,%ebx
+       cmpl    %edi,%esi
+       je      switch_return
+
+       /* If old lwp exited, don't bother. */
+       testl   %esi,%esi
+       jz      switch_exited
+
+       /*
+        * Second phase: save old context.
+        *
+        * Registers:
+        *   %eax, %ecx - scratch
+        *   %esi - old lwp, then old pcb
+        *   %edi - new lwp
+        */
+
+       pushl   %esi
+       call    _C_LABEL(pmap_deactivate2)      # pmap_deactivate(oldproc)
+       addl    $4,%esp
+
+       movl    L_ADDR(%esi),%esi
+
+       /* Save stack pointers. */
+       movl    %esp,PCB_ESP(%esi)
+       movl    %ebp,PCB_EBP(%esi)
+
+switch_exited:
+       /*
+        * Third phase: restore saved context.
+        *
+        * Registers:
+        *   %eax, %ebx, %ecx, %edx - scratch
+        *   %esi - new pcb
+        *   %edi - new lwp
+        */
+
+       /* No interrupts while loading new state. */
+       CLI(%eax)
+       movl    L_ADDR(%edi),%esi
+
+       /* Restore stack pointers. */
+       movl    PCB_ESP(%esi),%esp
+       movl    PCB_EBP(%esi),%ebp
+
+#if 0
+       /* Don't bother with the rest if switching to a system process. */
+       testl   $P_SYSTEM,L_FLAG(%edi)  /* XXX NJWLWP lwps don't have P_SYSTEM! */
+       jnz     switch_restored         /* XXX skip stack_switch+pmap_activate */
+#endif
+
+       pushl   %edi
+       call    _C_LABEL(pmap_activate)         # pmap_activate(p)
+       addl    $4,%esp
+
+       pushl   %esi
+       call    _C_LABEL(i386_switch_context)
+       addl    $4,%esp
+
+       /* Record new pcb. */
+       SET_CURPCB(%esi)
+
+       /* Interrupts are okay again. */
+       STI(%edi)
+
+/*
+ *  Check for restartable atomic sequences (RAS)
+ */
+       movl    CPUVAR(CURLWP),%edi
+       movl    L_PROC(%edi),%esi
+       cmpl    $0,P_RASLIST(%esi)
+       jne     2f
+1:
+       movl    $1,%ebx
+
+switch_return:
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)     
+       call    _C_LABEL(sched_unlock_idle)
+#endif
+       pushl   $IPL_NONE               # spl0()
+       call    _C_LABEL(Xspllower)     # process pending interrupts
+       addl    $4,%esp
+       movl    $IPL_HIGH,CPUVAR(ILEVEL)        # splhigh()
+
+       movl    %ebx,%eax
+
+       popl    %edi
+       popl    %esi
+       popl    %ebx
+       ret
+
+2:                                     # check RAS list
+       movl    L_MD_REGS(%edi),%ebx
+       movl    TF_EIP(%ebx),%eax
+       pushl   %eax
+       pushl   %esi
+       call    _C_LABEL(ras_lookup)
+       addl    $8,%esp
+       cmpl    $-1,%eax
+       je      1b
+       movl    %eax,TF_EIP(%ebx)
+       jmp     1b
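+
+/*
+ * XXX note: the RAS fixup above is roughly the following C sketch, where
+ * l is the incoming (now current) LWP and the usual NetBSD ras_lookup()
+ * contract -- returning (caddr_t)-1 when the PC is inside no registered
+ * sequence -- is assumed:
+ *
+ *     struct trapframe *tf = l->l_md.md_regs;
+ *     caddr_t ra;
+ *
+ *     if (l->l_proc->p_raslist != NULL) {
+ *             ra = ras_lookup(l->l_proc, (caddr_t)tf->tf_eip);
+ *             if (ra != (caddr_t)-1)
+ *                     tf->tf_eip = (int)ra;   (restart the sequence)
+ *     }
+ *
+ * If the new LWP was interrupted inside a restartable atomic sequence,
+ * its saved PC is wound back to the start of that sequence.
+ */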
+
+/*
+ * void cpu_switchto(struct lwp *current, struct lwp *next)
+ * Switch to the specified next LWP.
+ */
+ENTRY(cpu_switchto)
+       pushl   %ebx
+       pushl   %esi
+       pushl   %edi
+
+#ifdef DEBUG
+       cmpl    $IPL_SCHED,CPUVAR(ILEVEL)
+       jae     1f
+       pushl   $2f
+       call    _C_LABEL(panic)
+       /* NOTREACHED */
+2:     .asciz  "not splsched() in cpu_switchto!"
+1:
+#endif /* DEBUG */
+
+       movl    16(%esp),%esi           # current
+       movl    20(%esp),%edi           # next
+
+       /*
+        * Clear curlwp so that we don't accumulate system time while idle.
+        * This also ensures that schedcpu() will move the old process to
+        * the correct queue if it happens to get called from the spllower()
+        * below and changes the priority.  (See corresponding comment in
+        * usrret()).
+        *
+        * XXX Is this necessary?  We know we won't go idle.
+        */
+       movl    $0,CPUVAR(CURLWP)
+
+       /*
+        * We're running at splhigh(), but it's otherwise okay to take
+        * interrupts here.
+        */
+       STI(%eax)
+
+       /* Jump into the middle of cpu_switch */
+       xorl    %eax,%eax
+       jmp     switch_resume
+
+/*
+ * void cpu_exit(struct lwp *l)
+ * Switch to the appropriate idle context (lwp0's if uniprocessor; the CPU's
+ * if multiprocessor) and deallocate the address space and kernel stack for l.
+ * Then jump into cpu_switch(), as if we were in the idle proc all along.
+ */
+#ifndef MULTIPROCESSOR
+       .globl  _C_LABEL(lwp0)
+#endif
+       .globl  _C_LABEL(uvmspace_free),_C_LABEL(kernel_map)
+       .globl  _C_LABEL(uvm_km_free),_C_LABEL(tss_free)
+/* LINTSTUB: Func: void cpu_exit(struct lwp *l) */
+ENTRY(cpu_exit)
+       movl    4(%esp),%edi            # old process
+#ifndef MULTIPROCESSOR
+       movl    $_C_LABEL(lwp0),%ebx
+       movl    L_ADDR(%ebx),%esi
+       movl    L_MD_TSS_SEL(%ebx),%edx
+#else
+       movl    CPUVAR(IDLE_PCB),%esi
+       movl    CPUVAR(IDLE_TSS_SEL),%edx
+#endif
+       /* In case we fault... */
+       movl    $0,CPUVAR(CURLWP)
+
+       /* Restore the idle context. */
+       CLI(%eax)
+
+       /* Restore stack pointers. */
+       movl    PCB_ESP(%esi),%esp
+       movl    PCB_EBP(%esi),%ebp
+
+       pushl   %esi
+       call    _C_LABEL(i386_switch_context)
+       addl    $4,%esp
+
+       /* Record new pcb. */
+       SET_CURPCB(%esi)
+
+       /* Interrupts are okay again. */
+       STI(%eax)
+
+       /*
+        * Schedule the dead LWP's stack to be freed.
+        */
+       pushl   %edi
+       call    _C_LABEL(lwp_exit2)
+       addl    $4,%esp
+
+       /* Jump into cpu_switch() with the right state. */
+       xorl    %esi,%esi
+       movl    %esi,CPUVAR(CURLWP)
+       jmp     idle_start
+
+/*
+ * void savectx(struct pcb *pcb);
+ * Update pcb, saving current processor state.
+ */
+/* LINTSTUB: Func: void savectx(struct pcb *pcb) */
+ENTRY(savectx)
+       movl    4(%esp),%edx            # edx = p->p_addr
+  
+       /* Save stack pointers. */
+       movl    %esp,PCB_ESP(%edx)
+       movl    %ebp,PCB_EBP(%edx)
+
+       ret
+
+/*
+ * Old call gate entry for syscall
+ */
+/* LINTSTUB: Var: char Xosyscall[1]; */
+IDTVEC(osyscall)
+       /* Set eflags in trap frame. */
+       pushfl
+       popl    8(%esp)
+       pushl   $7              # size of instruction for restart
+       jmp     syscall1
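+
+/*
+ * XXX note: the constant pushed above lands in the error-code slot of
+ * the trap frame (tf_err) and is what syscall() subtracts from %eip to
+ * restart an interrupted system call: the old call-gate entry here is
+ * reached by the 7-byte "lcall $7,$0", the trap-gate entry below by the
+ * 2-byte "int $0x80".
+ */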
+
+/*
+ * Trap gate entry for syscall
+ */
+/* LINTSTUB: Var: char Xsyscall[1]; */
+IDTVEC(syscall)
+       pushl   $2              # size of instruction for restart
+syscall1:
+       pushl   $T_ASTFLT       # trap # for doing ASTs
+       INTRENTRY
+
+#ifdef DIAGNOSTIC
+       cmpl    $0, CPUVAR(WANT_PMAPLOAD)
+       jz      1f
+       pushl   $6f
+       call    _C_LABEL(printf)
+       addl    $4, %esp
+1:
+       movl    CPUVAR(ILEVEL),%ebx
+       testl   %ebx,%ebx
+       jz      1f
+       pushl   $5f
+       call    _C_LABEL(printf)
+       addl    $4,%esp
+#ifdef DDB
+       int     $3
+#endif
+1:     
+#endif /* DIAGNOSTIC */
+       movl    CPUVAR(CURLWP),%edx
+       movl    %esp,L_MD_REGS(%edx)    # save pointer to frame
+       movl    L_PROC(%edx),%edx
+       pushl   %esp
+       call    *P_MD_SYSCALL(%edx)     # get pointer to syscall() function
+       addl    $4,%esp
+syscall_checkast:
+       /* Check for ASTs on exit to user mode. */
+       CLI(%eax)
+       CHECK_ASTPENDING(%eax)
+       je      1f
+       /* Always returning to user mode here. */
+       CLEAR_ASTPENDING(%eax)
+       STI(%eax)
+       /* Pushed T_ASTFLT into tf_trapno on entry. */
+       pushl   %esp
+       call    _C_LABEL(trap)
+       addl    $4,%esp
+       jmp     syscall_checkast
+1:     STI(%eax)
+       CHECK_DEFERRED_SWITCH(%eax)
+       jnz     9f
+#ifndef DIAGNOSTIC
+       INTRFASTEXIT
+#else /* DIAGNOSTIC */
+       cmpl    $IPL_NONE,CPUVAR(ILEVEL)
+       jne     3f
+       INTRFASTEXIT
+3:     pushl   $4f
+       call    _C_LABEL(printf)
+       addl    $4,%esp
+#ifdef DDB
+       int     $3
+#endif /* DDB */
+       movl    $IPL_NONE,CPUVAR(ILEVEL)
+       jmp     2b
+4:     .asciz  "WARNING: SPL NOT LOWERED ON SYSCALL EXIT\n"
+5:     .asciz  "WARNING: SPL NOT ZERO ON SYSCALL ENTRY\n"      
+6:     .asciz  "WARNING: WANT PMAPLOAD ON SYSCALL ENTRY\n"     
+#endif /* DIAGNOSTIC */
+9:     call    _C_LABEL(pmap_load)
+       jmp     syscall_checkast        /* re-check ASTs */
+
+#if NNPX > 0
+/*
+ * Special interrupt handlers.  Someday intr0-intr15 will be used to count
+ * interrupts.  We'll still need a special exception 16 handler.  The busy
+ * latch stuff in probintr() can be moved to npxprobe().
+ */
+
+/* LINTSTUB: Func: void probeintr(void) */
+NENTRY(probeintr)
+       ss
+       incl    _C_LABEL(npx_intrs_while_probing)
+       pushl   %eax
+       movb    $0x20,%al       # EOI (asm in strings loses cpp features)
+       outb    %al,$0xa0       # IO_ICU2
+       outb    %al,$0x20       # IO_ICU1
+       movb    $0,%al
+       outb    %al,$0xf0       # clear BUSY# latch
+       popl    %eax
+       iret
+
+/* LINTSTUB: Func: void probetrap(void) */
+NENTRY(probetrap)
+       ss
+       incl    _C_LABEL(npx_traps_while_probing)
+       fnclex
+       iret
+
+/* LINTSTUB: Func: int npx586bug1(int a, int b) */
+NENTRY(npx586bug1)
+       fildl   4(%esp)         # x
+       fildl   8(%esp)         # y
+       fld     %st(1)
+       fdiv    %st(1),%st      # x/y
+       fmulp   %st,%st(1)      # (x/y)*y
+       fsubrp  %st,%st(1)      # x-(x/y)*y
+       pushl   $0
+       fistpl  (%esp)
+       popl    %eax
+       ret
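+
+/*
+ * XXX note: npx586bug1(x, y) computes x - (x/y)*y in the FPU, which is
+ * exactly 0 on a correct chip.  The probe in npx.c presumably feeds it
+ * the classic FDIV erratum operands, along the lines of:
+ *
+ *     if (npx586bug1(4195835, 3145727) != 0)
+ *             i386_fpu_fdivbug = 1;
+ *
+ * (illustrative only; the actual caller and operands live in npx.c).
+ */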
+#endif /* NNPX > 0 */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/machdep.c
new file mode 100644 (file)
index 0000000..61d2898
--- /dev/null
@@ -0,0 +1,2561 @@
+/*     $NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $       */
+/*     NetBSD: machdep.c,v 1.552 2004/03/24 15:34:49 atatat Exp        */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998, 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
+ * Simulation Facility, NASA Ames Research Center.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the NetBSD
+ *     Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)machdep.c   7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.2.2.1 2004/05/22 15:58:02 he Exp $");
+
+#include "opt_beep.h"
+#include "opt_compat_ibcs2.h"
+#include "opt_compat_mach.h"   /* need to get the right segment def */
+#include "opt_compat_netbsd.h"
+#include "opt_compat_svr4.h"
+#include "opt_cpureset_delay.h"
+#include "opt_cputype.h"
+#include "opt_ddb.h"
+#include "opt_ipkdb.h"
+#include "opt_kgdb.h"
+#include "opt_mtrr.h"
+#include "opt_multiprocessor.h"
+#include "opt_realmem.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/exec.h>
+#include <sys/buf.h>
+#include <sys/reboot.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/msgbuf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/extent.h>
+#include <sys/syscallargs.h>
+#include <sys/core.h>
+#include <sys/kcore.h>
+#include <sys/ucontext.h>
+#include <machine/kcore.h>
+#include <sys/ras.h>
+#include <sys/sa.h>
+#include <sys/savar.h>
+#include <sys/ksyms.h>
+
+#ifdef IPKDB
+#include <ipkdb/ipkdb.h>
+#endif
+
+#ifdef KGDB
+#include <sys/kgdb.h>
+#endif
+
+#include <dev/cons.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/cpuvar.h>
+#include <machine/gdt.h>
+#include <machine/pio.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/specialreg.h>
+#include <machine/bootinfo.h>
+#include <machine/mtrr.h>
+#include <machine/evtchn.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+#include <dev/ic/i8042reg.h>
+
+#ifdef DDB
+#include <machine/db_machdep.h>
+#include <ddb/db_extern.h>
+#endif
+
+#ifdef VM86
+#include <machine/vm86.h>
+#endif
+
+#include "acpi.h"
+#include "apm.h"
+#include "bioscall.h"
+
+#if NBIOSCALL > 0
+#include <machine/bioscall.h>
+#endif
+
+#if NACPI > 0
+#include <dev/acpi/acpivar.h>
+#define ACPI_MACHDEP_PRIVATE
+#include <machine/acpi_machdep.h>
+#endif
+
+#if NAPM > 0
+#include <machine/apmvar.h>
+#endif
+
+#include "isa.h"
+#include "isadma.h"
+#include "npx.h"
+#include "ksyms.h"
+
+#include "mca.h"
+#if NMCA > 0
+#include <machine/mca_machdep.h>       /* for mca_busprobe() */
+#endif
+
+#ifdef MULTIPROCESSOR          /* XXX */
+#include <machine/mpbiosvar.h> /* XXX */
+#endif                         /* XXX */
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#if defined(DDB) || defined(KGDB)
+#include <ddb/db_interface.h>
+#include <ddb/db_output.h>
+
+void ddb_trap_hook(int);
+#endif
+
+/* #define     XENDEBUG */
+/* #define     XENDEBUG_LOW */
+
+#ifdef XENDEBUG
+extern void printk(char *, ...);
+#define        XENPRINTF(x) printf x
+#define        XENPRINTK(x) printk x
+#else
+#define        XENPRINTF(x)
+#define        XENPRINTK(x)
+#endif
+#define        PRINTK(x) printf x
+
+#ifdef XENDEBUG_LOW
+void xen_dbglow_init(void);
+#endif
+
+#ifndef BEEP_ONHALT_COUNT
+#define BEEP_ONHALT_COUNT 3
+#endif
+#ifndef BEEP_ONHALT_PITCH
+#define BEEP_ONHALT_PITCH 1500
+#endif
+#ifndef BEEP_ONHALT_PERIOD
+#define BEEP_ONHALT_PERIOD 250
+#endif
+
+/* the following is used externally (sysctl_hw) */
+char machine[] = "i386";               /* CPU "architecture" */
+char machine_arch[] = "i386";          /* machine == machine_arch */
+
+char bootinfo[BOOTINFO_MAXSIZE];
+
+struct bi_devmatch *i386_alldisks = NULL;
+int i386_ndisks = 0;
+
+#ifdef CPURESET_DELAY
+int    cpureset_delay = CPURESET_DELAY;
+#else
+int     cpureset_delay = 2000; /* default to 2s */
+#endif
+
+#ifdef MTRR
+struct mtrr_funcs *mtrr_funcs;
+#endif
+
+#ifdef COMPAT_NOMID
+static int exec_nomid(struct proc *, struct exec_package *);
+#endif
+
+int    physmem;
+int    dumpmem_low;
+int    dumpmem_high;
+unsigned int cpu_feature;
+int    cpu_class;
+int    i386_fpu_present;
+int    i386_fpu_exception;
+int    i386_fpu_fdivbug;
+
+int    i386_use_fxsave;
+int    i386_has_sse;
+int    i386_has_sse2;
+
+int    tmx86_has_longrun;
+
+vaddr_t        msgbuf_vaddr;
+paddr_t msgbuf_paddr;
+
+vaddr_t        idt_vaddr;
+paddr_t        idt_paddr;
+
+#ifdef I586_CPU
+vaddr_t        pentium_idt_vaddr;
+#endif
+
+struct vm_map *exec_map = NULL;
+struct vm_map *mb_map = NULL;
+struct vm_map *phys_map = NULL;
+
+extern paddr_t avail_start, avail_end;
+extern paddr_t pmap_pa_start, pmap_pa_end;
+
+#ifdef ISA_CLOCK
+void (*delay_func)(int) = i8254_delay;
+void (*microtime_func)(struct timeval *) = i8254_microtime;
+void (*initclock_func)(void) = i8254_initclocks;
+#else
+void (*delay_func)(int) = xen_delay;
+void (*microtime_func)(struct timeval *) = xen_microtime;
+void (*initclock_func)(void) = xen_initclocks;
+#endif
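+
+/*
+ * XXX note: MI code reaches these through thin wrappers -- assumed here
+ * to be the usual NetBSD form, e.g.
+ *
+ *     #define delay(n)        (*delay_func)(n)
+ *
+ * in <machine/cpu.h> -- so a Xen kernel transparently gets xen_delay(),
+ * xen_microtime() and xen_initclocks() while an ISA_CLOCK build keeps
+ * the i8254 versions.
+ */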
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+/*
+ * Size of memory segments, before any memory is stolen.
+ */
+phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
+int    mem_cluster_cnt;
+
+int    cpu_dump(void);
+int    cpu_dumpsize(void);
+u_long cpu_dump_mempagecnt(void);
+void   dumpsys(void);
+void   init386(paddr_t);
+void   initgdt(void);
+
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+void   add_mem_cluster(u_int64_t, u_int64_t, u_int32_t);
+#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
+
+extern int time_adjusted;
+
+/*
+ * Machine-dependent startup code
+ */
+void
+cpu_startup()
+{
+       int x;
+       vaddr_t minaddr, maxaddr;
+       char pbuf[9];
+
+       /*
+        * Initialize error message buffer (at end of core).
+        */
+       msgbuf_vaddr = uvm_km_valloc(kernel_map, x86_round_page(MSGBUFSIZE));
+       if (msgbuf_vaddr == 0)
+               panic("failed to valloc msgbuf_vaddr");
+
+       /* msgbuf_paddr was init'd in pmap */
+       for (x = 0; x < btoc(MSGBUFSIZE); x++)
+               pmap_kenter_pa((vaddr_t)msgbuf_vaddr + x * PAGE_SIZE,
+                   msgbuf_paddr + x * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE);
+       pmap_update(pmap_kernel());
+
+       initmsgbuf((caddr_t)msgbuf_vaddr, round_page(MSGBUFSIZE));
+
+       printf("%s", version);
+
+#ifdef TRAPLOG
+       /*
+        * Enable recording of branch from/to in MSR's
+        */
+       wrmsr(MSR_DEBUGCTLMSR, 0x1);
+#endif
+
+       format_bytes(pbuf, sizeof(pbuf), ptoa(physmem));
+       printf("total memory = %s\n", pbuf);
+
+       minaddr = 0;
+
+       /*
+        * Allocate a submap for exec arguments.  This map effectively
+        * limits the number of processes exec'ing at any time.
+        */
+       exec_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+                                  16*NCARGS, VM_MAP_PAGEABLE, FALSE, NULL);
+
+       /*
+        * Allocate a submap for physio
+        */
+       phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+                                  VM_PHYS_SIZE, 0, FALSE, NULL);
+
+       /*
+        * Finally, allocate mbuf cluster submap.
+        */
+       mb_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
+           nmbclusters * mclbytes, VM_MAP_INTRSAFE, FALSE, NULL);
+
+       format_bytes(pbuf, sizeof(pbuf), ptoa(uvmexp.free));
+       printf("avail memory = %s\n", pbuf);
+
+       /* Safe for i/o port / memory space allocation to use malloc now. */
+       x86_bus_space_mallocok();
+}
+
+/*
+ * Set up proc0's TSS and LDT.
+ */
+void
+i386_proc0_tss_ldt_init()
+{
+       struct pcb *pcb;
+       int x;
+
+       gdt_init();
+
+       cpu_info_primary.ci_curpcb = pcb = &lwp0.l_addr->u_pcb;
+
+       pcb->pcb_tss.tss_ioopt =
+           ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
+               | SEL_KPL;              /* i/o pl */
+
+       for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
+               pcb->pcb_iomap[x] = 0xffffffff;
+
+       pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+       pcb->pcb_cr0 = rcr0();
+       pcb->pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
+       pcb->pcb_tss.tss_esp0 = (int)lwp0.l_addr + USPACE - 16;
+       lwp0.l_md.md_regs = (struct trapframe *)pcb->pcb_tss.tss_esp0 - 1;
+       lwp0.l_md.md_tss_sel = tss_alloc(pcb);
+
+#ifndef XEN
+       ltr(lwp0.l_md.md_tss_sel);
+       lldt(pcb->pcb_ldt_sel);
+#else
+       HYPERVISOR_fpu_taskswitch();
+       XENPRINTF(("lwp tss sp %p ss %04x/%04x\n",
+                     (void *)pcb->pcb_tss.tss_esp0,
+                     pcb->pcb_tss.tss_ss0, IDXSEL(pcb->pcb_tss.tss_ss0)));
+       HYPERVISOR_stack_switch(pcb->pcb_tss.tss_ss0, pcb->pcb_tss.tss_esp0);
+#endif
+}
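+
+/*
+ * XXX note: a Xen guest runs outside ring 0 and may not execute
+ * ltr/lldt itself, so the XEN branch above instead registers the kernel
+ * stack for user->kernel transitions with the hypervisor
+ * (HYPERVISOR_stack_switch) -- the paravirtual stand-in for loading
+ * tss_ss0/tss_esp0 through the hardware task register.
+ */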
+
+/*
+ * Set up TSS and LDT for a new PCB.
+ */
+
+void
+i386_init_pcb_tss_ldt(struct cpu_info *ci)
+{
+       int x;
+       struct pcb *pcb = ci->ci_idle_pcb;
+
+       pcb->pcb_tss.tss_ioopt =
+           ((caddr_t)pcb->pcb_iomap - (caddr_t)&pcb->pcb_tss) << 16
+               | SEL_KPL;              /* i/o pl */
+       for (x = 0; x < sizeof(pcb->pcb_iomap) / 4; x++)
+               pcb->pcb_iomap[x] = 0xffffffff;
+
+       pcb->pcb_ldt_sel = pmap_kernel()->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+       pcb->pcb_cr0 = rcr0();
+
+       ci->ci_idle_tss_sel = tss_alloc(pcb);
+}
+
+/*
+ * Switch context:
+ * - honor CR0_TS in saved CR0 and request DNA exception on FPU use
+ * - switch stack pointer for user->kernel transition
+ */
+void
+i386_switch_context(struct pcb *new)
+{
+       dom0_op_t op;
+       struct cpu_info *ci;
+
+       ci = curcpu();
+       if (ci->ci_fpused) {
+               HYPERVISOR_fpu_taskswitch();
+               ci->ci_fpused = 0;
+       }
+
+       HYPERVISOR_stack_switch(new->pcb_tss.tss_ss0, new->pcb_tss.tss_esp0);
+
+       if (xen_start_info.flags & SIF_PRIVILEGED) {
+               op.cmd = DOM0_IOPL;
+               op.u.iopl.domain = DOMID_SELF;
+               op.u.iopl.iopl = new->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
+               HYPERVISOR_dom0_op(&op);
+       }
+}
+
+/*
+ * sysctl helper routine for machdep.tm* nodes.
+ */
+static int
+sysctl_machdep_tm_longrun(SYSCTLFN_ARGS)
+{
+       struct sysctlnode node;
+       int io, error;
+
+       if (!tmx86_has_longrun)
+               return (EOPNOTSUPP);
+
+       node = *rnode;
+       node.sysctl_data = &io;
+
+       switch (rnode->sysctl_num) {
+       case CPU_TMLR_MODE:
+               io = (int)(crusoe_longrun = tmx86_get_longrun_mode());
+               break;
+       case CPU_TMLR_FREQUENCY:
+               tmx86_get_longrun_status_all();
+               io = crusoe_frequency;
+               break;
+       case CPU_TMLR_VOLTAGE:
+               tmx86_get_longrun_status_all();
+               io = crusoe_voltage;
+               break;
+       case CPU_TMLR_PERCENTAGE:
+               tmx86_get_longrun_status_all();
+               io = crusoe_percentage;
+               break;
+       default:
+               return (EOPNOTSUPP);
+       }
+
+       error = sysctl_lookup(SYSCTLFN_CALL(&node));
+       if (error || newp == NULL)
+               return (error);
+
+       if (rnode->sysctl_num == CPU_TMLR_MODE) {
+               if (tmx86_set_longrun_mode(io))
+                       crusoe_longrun = (u_int)io;
+               else
+                       return (EINVAL);
+       }
+
+       return (0);
+}
+
+/*
+ * sysctl helper routine for machdep.booted_kernel
+ */
+static int
+sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
+{
+       struct btinfo_bootpath *bibp;
+       struct sysctlnode node;
+
+       bibp = lookup_bootinfo(BTINFO_BOOTPATH);
+       if (bibp == NULL)
+               return (ENOENT);        /* ??? */
+
+       node = *rnode;
+       node.sysctl_data = bibp->bootpath;
+       node.sysctl_size = sizeof(bibp->bootpath);
+       return (sysctl_lookup(SYSCTLFN_CALL(&node)));
+}
+
+/*
+ * sysctl helper routine for machdep.diskinfo
+ */
+static int
+sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
+{
+       struct sysctlnode node;
+
+       node = *rnode;
+       node.sysctl_data = i386_alldisks;
+       node.sysctl_size = sizeof(struct disklist) +
+           (i386_ndisks - 1) * sizeof(struct nativedisk_info);
+        return (sysctl_lookup(SYSCTLFN_CALL(&node)));
+}
+
+/*
+ * machine dependent system variables.
+ */
+SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
+{
+
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "machdep", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_MACHDEP, CTL_EOL);
+
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_STRUCT, "console_device", NULL,
+                      sysctl_consdev, 0, NULL, sizeof(dev_t),
+                      CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "biosbasemem", NULL,
+                      NULL, 0, &biosbasemem, 0,
+                      CTL_MACHDEP, CPU_BIOSBASEMEM, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "biosextmem", NULL,
+                      NULL, 0, &biosextmem, 0,
+                      CTL_MACHDEP, CPU_BIOSEXTMEM, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "nkpde", NULL,
+                      NULL, 0, &nkpde, 0,
+                      CTL_MACHDEP, CPU_NKPDE, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_STRING, "booted_kernel", NULL,
+                      sysctl_machdep_booted_kernel, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_STRUCT, "diskinfo", NULL,
+                      sysctl_machdep_diskinfo, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "fpu_present", NULL,
+                      NULL, 0, &i386_fpu_present, 0,
+                      CTL_MACHDEP, CPU_FPU_PRESENT, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "osfxsr", NULL,
+                      NULL, 0, &i386_use_fxsave, 0,
+                      CTL_MACHDEP, CPU_OSFXSR, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "sse", NULL,
+                      NULL, 0, &i386_has_sse, 0,
+                      CTL_MACHDEP, CPU_SSE, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "sse2", NULL,
+                      NULL, 0, &i386_has_sse2, 0,
+                      CTL_MACHDEP, CPU_SSE2, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "tm_longrun_mode", NULL,
+                      sysctl_machdep_tm_longrun, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_TMLR_MODE, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "tm_longrun_frequency", NULL,
+                      sysctl_machdep_tm_longrun, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_TMLR_FREQUENCY, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "tm_longrun_voltage", NULL,
+                      sysctl_machdep_tm_longrun, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_TMLR_VOLTAGE, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_INT, "tm_longrun_percentage", NULL,
+                      sysctl_machdep_tm_longrun, 0, NULL, 0,
+                      CTL_MACHDEP, CPU_TMLR_PERCENTAGE, CTL_EOL);
+}
+
+void *
+getframe(struct lwp *l, int sig, int *onstack)
+{
+       struct proc *p = l->l_proc;
+       struct sigctx *ctx = &p->p_sigctx;
+       struct trapframe *tf = l->l_md.md_regs;
+
+       /* Do we need to jump onto the signal stack? */
+       *onstack = (ctx->ps_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
+           && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
+       if (*onstack)
+               return (char *)ctx->ps_sigstk.ss_sp + ctx->ps_sigstk.ss_size;
+#ifdef VM86
+       if (tf->tf_eflags & PSL_VM)
+               return (void *)(tf->tf_esp + (tf->tf_ss << 4));
+       else
+#endif
+               return (void *)tf->tf_esp;
+}
+
+/*
+ * Build context to run handler in.  We invoke the handler
+ * directly, only returning via the trampoline.  Note the
+ * trampoline version numbers are coordinated with machine-
+ * dependent code in libc.
+ */
+void
+buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
+{
+       struct trapframe *tf = l->l_md.md_regs;
+
+       tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_eip = (int)catcher;
+       tf->tf_cs = GSEL(sel, SEL_UPL);
+       tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
+       tf->tf_esp = (int)fp;
+       tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
+}
+
+static void
+sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
+{
+       struct lwp *l = curlwp;
+       struct proc *p = l->l_proc;
+       struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
+       int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+           GUCODEBIG_SEL : GUCODE_SEL;
+       struct sigacts *ps = p->p_sigacts;
+       int onstack;
+       int sig = ksi->ksi_signo;
+       struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
+       sig_t catcher = SIGACTION(p, sig).sa_handler;
+       struct trapframe *tf = l->l_md.md_regs;
+
+       fp--;
+
+       /* Build stack frame for signal trampoline. */
+       switch (ps->sa_sigdesc[sig].sd_vers) {
+       case 0:         /* handled by sendsig_sigcontext */
+       case 1:         /* handled by sendsig_sigcontext */
+       default:        /* unknown version */
+               printf("nsendsig: bad version %d\n",
+                   ps->sa_sigdesc[sig].sd_vers);
+               sigexit(l, SIGILL);
+       case 2:
+               break;
+       }
+
+       frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
+       frame.sf_signum = sig;
+       frame.sf_sip = &fp->sf_si;
+       frame.sf_ucp = &fp->sf_uc;
+       frame.sf_si._info = ksi->ksi_info;
+       frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
+       frame.sf_uc.uc_sigmask = *mask;
+       frame.sf_uc.uc_link = NULL;
+       frame.sf_uc.uc_flags |= (p->p_sigctx.ps_sigstk.ss_flags & SS_ONSTACK)
+           ? _UC_SETSTACK : _UC_CLRSTACK;
+       memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack));
+       cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
+
+       if (tf->tf_eflags & PSL_VM)
+               (*p->p_emul->e_syscall_intern)(p);
+
+       if (copyout(&frame, fp, sizeof(frame)) != 0) {
+               /*
+                * Process has trashed its stack; give it an illegal
+                * instruction to halt it in its tracks.
+                */
+               sigexit(l, SIGILL);
+               /* NOTREACHED */
+       }
+
+       buildcontext(l, sel, catcher, fp);
+
+       /* Remember that we're now on the signal stack. */
+       if (onstack)
+               p->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
+}
+
+void
+sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
+{
+#ifdef COMPAT_16
+       if (curproc->p_sigacts->sa_sigdesc[ksi->ksi_signo].sd_vers < 2)
+               sendsig_sigcontext(ksi, mask);
+       else
+#endif
+               sendsig_siginfo(ksi, mask);
+}
+
+void
+cpu_upcall(struct lwp *l, int type, int nevents, int ninterrupted, void *sas,
+    void *ap, void *sp, sa_upcall_t upcall)
+{
+       struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+       struct saframe *sf, frame;
+       struct trapframe *tf;
+
+       tf = l->l_md.md_regs;
+
+       /* Finally, copy out the rest of the frame. */
+       frame.sa_type = type;
+       frame.sa_sas = sas;
+       frame.sa_events = nevents;
+       frame.sa_interrupted = ninterrupted;
+       frame.sa_arg = ap;
+       frame.sa_ra = 0;
+
+       sf = (struct saframe *)sp - 1;
+       if (copyout(&frame, sf, sizeof(frame)) != 0) {
+               /* Copying onto the stack didn't work. Die. */
+               sigexit(l, SIGILL);
+               /* NOTREACHED */
+       }
+
+       tf->tf_eip = (int) upcall;
+       tf->tf_esp = (int) sf;
+       tf->tf_ebp = 0; /* indicate call-frame-top to debuggers */
+       tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+           GSEL(GUCODEBIG_SEL, SEL_UPL) : GSEL(GUCODE_SEL, SEL_UPL);
+       tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
+       tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
+}
+
+int    waittime = -1;
+struct pcb dumppcb;
+
+void
+cpu_reboot(int howto, char *bootstr)
+{
+
+       if (cold) {
+               howto |= RB_HALT;
+               goto haltsys;
+       }
+
+       boothowto = howto;
+       if ((howto & RB_NOSYNC) == 0 && waittime < 0) {
+               waittime = 0;
+               vfs_shutdown();
+               /*
+                * If we've been adjusting the clock, the todr
+                * will be out of synch; adjust it now.
+                */
+               if (time_adjusted != 0)
+                       resettodr();
+       }
+
+       /* Disable interrupts. */
+       splhigh();
+
+       /* Do a dump if requested. */
+       if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
+               dumpsys();
+
+haltsys:
+       doshutdownhooks();
+
+#ifdef MULTIPROCESSOR
+       x86_broadcast_ipi(X86_IPI_HALT);
+#endif
+
+       if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
+#if NACPI > 0
+               if (acpi_softc != NULL) {
+                       delay(500000);
+                       acpi_enter_sleep_state(acpi_softc, ACPI_STATE_S5);
+                       printf("WARNING: ACPI powerdown failed!\n");
+               }
+#endif
+#if NAPM > 0 && !defined(APM_NO_POWEROFF)
+               /* turn off, if we can.  But try to turn disk off and
+                * wait a bit first--some disk drives are slow to clean up
+                * and users have reported disk corruption.
+                */
+               delay(500000);
+               apm_set_powstate(APM_DEV_DISK(0xff), APM_SYS_OFF);
+               delay(500000);
+               apm_set_powstate(APM_DEV_ALLDEVS, APM_SYS_OFF);
+               printf("WARNING: APM powerdown failed!\n");
+               /*
+                * RB_POWERDOWN implies RB_HALT... fall into it...
+                */
+#endif
+               HYPERVISOR_shutdown();
+       }
+
+       if (howto & RB_HALT) {
+               printf("\n");
+               printf("The operating system has halted.\n");
+               printf("Please press any key to reboot.\n\n");
+
+#ifdef BEEP_ONHALT
+               {
+                       int c;
+                       for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
+                               sysbeep(BEEP_ONHALT_PITCH,
+                                       BEEP_ONHALT_PERIOD * hz / 1000);
+                               delay(BEEP_ONHALT_PERIOD * 1000);
+                               sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
+                               delay(BEEP_ONHALT_PERIOD * 1000);
+                       }
+               }
+#endif
+
+               cnpollc(1);     /* for proper keyboard command handling */
+               if (cngetc() == 0) {
+                       /* no console attached, so just hlt */
+                       for(;;) {
+                               __asm __volatile("hlt");
+                       }
+               }
+               cnpollc(0);
+       }
+
+       printf("rebooting...\n");
+       if (cpureset_delay > 0)
+               delay(cpureset_delay * 1000);
+       cpu_reset();
+       for(;;) ;
+       /*NOTREACHED*/
+}
+
+/*
+ * These variables are needed by /sbin/savecore
+ */
+u_int32_t dumpmag = 0x8fca0101;        /* magic number */
+int    dumpsize = 0;           /* pages */
+long   dumplo = 0;             /* blocks */
+
+/*
+ * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers.
+ */
+int
+cpu_dumpsize()
+{
+       int size;
+
+       size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
+           ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
+       if (roundup(size, dbtob(1)) != dbtob(1))
+               return (-1);
+
+       return (1);
+}
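+
+/*
+ * XXX note: the roundup() test above just enforces that all of the
+ * machine-dependent headers fit in a single disk block; with the usual
+ * DEV_BSIZE of 512, "roundup(size, dbtob(1)) != dbtob(1)" is equivalent
+ * to "size > 512", which is why the function can answer a flat
+ * "1 block".
+ */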
+
+/*
+ * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped.
+ */
+u_long
+cpu_dump_mempagecnt()
+{
+       u_long i, n;
+
+       n = 0;
+       for (i = 0; i < mem_cluster_cnt; i++)
+               n += atop(mem_clusters[i].size);
+       return (n);
+}
+
+/*
+ * cpu_dump: dump the machine-dependent kernel core dump headers.
+ */
+int
+cpu_dump()
+{
+       int (*dump)(dev_t, daddr_t, caddr_t, size_t);
+       char buf[dbtob(1)];
+       kcore_seg_t *segp;
+       cpu_kcore_hdr_t *cpuhdrp;
+       phys_ram_seg_t *memsegp;
+       const struct bdevsw *bdev;
+       int i;
+
+       bdev = bdevsw_lookup(dumpdev);
+       if (bdev == NULL)
+               return (ENXIO);
+       dump = bdev->d_dump;
+
+       memset(buf, 0, sizeof buf);
+       segp = (kcore_seg_t *)buf;
+       cpuhdrp = (cpu_kcore_hdr_t *)&buf[ALIGN(sizeof(*segp))];
+       memsegp = (phys_ram_seg_t *)&buf[ ALIGN(sizeof(*segp)) +
+           ALIGN(sizeof(*cpuhdrp))];
+
+       /*
+        * Generate a segment header.
+        */
+       CORE_SETMAGIC(*segp, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
+       segp->c_size = dbtob(1) - ALIGN(sizeof(*segp));
+
+       /*
+        * Add the machine-dependent header info.
+        */
+       cpuhdrp->ptdpaddr = PTDpaddr;
+       cpuhdrp->nmemsegs = mem_cluster_cnt;
+
+       /*
+        * Fill in the memory segment descriptors.
+        */
+       for (i = 0; i < mem_cluster_cnt; i++) {
+               memsegp[i].start = mem_clusters[i].start;
+               memsegp[i].size = mem_clusters[i].size;
+       }
+
+       return (dump(dumpdev, dumplo, (caddr_t)buf, dbtob(1)));
+}
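+
+/*
+ * XXX note: the single header block written above is laid out as
+ *
+ *     offset 0                                kcore_seg_t
+ *     ALIGN(sizeof(kcore_seg_t))              cpu_kcore_hdr_t
+ *     ... + ALIGN(sizeof(cpu_kcore_hdr_t))    phys_ram_seg_t[mem_cluster_cnt]
+ *
+ * and is what savecore(8) reads back to interpret the raw pages that
+ * dumpsys() writes after it.
+ */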
+
+/*
+ * This is called by main to set dumplo and dumpsize.
+ * Dumps always skip the first PAGE_SIZE of disk space
+ * in case there might be a disk label stored there.
+ * If there is extra space, put dump at the end to
+ * reduce the chance that swapping trashes it.
+ */
+void
+cpu_dumpconf()
+{
+       const struct bdevsw *bdev;
+       int nblks, dumpblks;    /* size of dump area */
+
+       if (dumpdev == NODEV)
+               goto bad;
+       bdev = bdevsw_lookup(dumpdev);
+       if (bdev == NULL)
+               panic("dumpconf: bad dumpdev=0x%x", dumpdev);
+       if (bdev->d_psize == NULL)
+               goto bad;
+       nblks = (*bdev->d_psize)(dumpdev);
+       if (nblks <= ctod(1))
+               goto bad;
+
+       dumpblks = cpu_dumpsize();
+       if (dumpblks < 0)
+               goto bad;
+       dumpblks += ctod(cpu_dump_mempagecnt());
+
+       /* If dump won't fit (incl. room for possible label), punt. */
+       if (dumpblks > (nblks - ctod(1)))
+               goto bad;
+
+       /* Put dump at end of partition */
+       dumplo = nblks - dumpblks;
+
+       /* dumpsize is in page units, and doesn't include headers. */
+       dumpsize = cpu_dump_mempagecnt();
+       return;
+
+ bad:
+       dumpsize = 0;
+}
+
+/*
+ * Doadump comes here after turning off memory management and
+ * getting on the dump stack, either when called above, or by
+ * the auto-restart code.
+ */
+#define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
+static vaddr_t dumpspace;
+
+vaddr_t
+reserve_dumppages(vaddr_t p)
+{
+
+       dumpspace = p;
+       return (p + BYTES_PER_DUMP);
+}
+
+void
+dumpsys()
+{
+       u_long totalbytesleft, bytes, i, n, memseg;
+       u_long maddr;
+       int psize;
+       daddr_t blkno;
+       const struct bdevsw *bdev;
+       int (*dump)(dev_t, daddr_t, caddr_t, size_t);
+       int error;
+
+       /* Save registers. */
+       savectx(&dumppcb);
+
+       if (dumpdev == NODEV)
+               return;
+
+       bdev = bdevsw_lookup(dumpdev);
+       if (bdev == NULL || bdev->d_psize == NULL)
+               return;
+
+       /*
+        * For dumps during autoconfiguration: if the dump device has
+        * not been configured yet, do that now.
+        */
+       if (dumpsize == 0)
+               cpu_dumpconf();
+       if (dumplo <= 0 || dumpsize == 0) {
+               printf("\ndump to dev %u,%u not possible\n", major(dumpdev),
+                   minor(dumpdev));
+               return;
+       }
+       printf("\ndumping to dev %u,%u offset %ld\n", major(dumpdev),
+           minor(dumpdev), dumplo);
+
+       psize = (*bdev->d_psize)(dumpdev);
+       printf("dump ");
+       if (psize == -1) {
+               printf("area unavailable\n");
+               return;
+       }
+
+#if 0  /* XXX this doesn't work.  grr. */
+        /* toss any characters present prior to dump */
+       while (sget() != NULL); /*syscons and pccons differ */
+#endif
+
+       if ((error = cpu_dump()) != 0)
+               goto err;
+
+       totalbytesleft = ptoa(cpu_dump_mempagecnt());
+       blkno = dumplo + cpu_dumpsize();
+       dump = bdev->d_dump;
+       error = 0;
+
+       for (memseg = 0; memseg < mem_cluster_cnt; memseg++) {
+               maddr = mem_clusters[memseg].start;
+               bytes = mem_clusters[memseg].size;
+
+               for (i = 0; i < bytes; i += n, totalbytesleft -= n) {
+                       /* Print out how many MBs we have left to go. */
+                       if ((totalbytesleft % (1024*1024)) == 0)
+                               printf("%ld ", totalbytesleft / (1024 * 1024));
+
+                       /* Limit size for next transfer. */
+                       n = bytes - i;
+                       if (n > BYTES_PER_DUMP)
+                               n = BYTES_PER_DUMP;
+
+                       (void) pmap_map(dumpspace, maddr, maddr + n,
+                           VM_PROT_READ);
+
+                       error = (*dump)(dumpdev, blkno, (caddr_t)dumpspace, n);
+                       if (error)
+                               goto err;
+                       maddr += n;
+                       blkno += btodb(n);              /* XXX? */
+
+#if 0  /* XXX this doesn't work.  grr. */
+                       /* operator aborting dump? */
+                       if (sget() != NULL) {
+                               error = EINTR;
+                               break;
+                       }
+#endif
+               }
+       }
+
+ err:
+       switch (error) {
+
+       case ENXIO:
+               printf("device bad\n");
+               break;
+
+       case EFAULT:
+               printf("device not ready\n");
+               break;
+
+       case EINVAL:
+               printf("area improper\n");
+               break;
+
+       case EIO:
+               printf("i/o error\n");
+               break;
+
+       case EINTR:
+               printf("aborted from console\n");
+               break;
+
+       case 0:
+               printf("succeeded\n");
+               break;
+
+       default:
+               printf("error %d\n", error);
+               break;
+       }
+       printf("\n\n");
+       delay(5000000);         /* 5 seconds */
+}
+
+/*
+ * Clear registers on exec
+ */
+void
+setregs(struct lwp *l, struct exec_package *pack, u_long stack)
+{
+       struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       struct trapframe *tf;
+
+#if NNPX > 0
+       /* If we were using the FPU, forget about it. */
+       if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
+               npxsave_lwp(l, 0);
+#endif
+
+#ifdef USER_LDT
+       pmap_ldt_cleanup(l);
+#endif
+
+       l->l_md.md_flags &= ~MDL_USEDFPU;
+       if (i386_use_fxsave) {
+               pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __NetBSD_NPXCW__;
+               pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
+       } else
+               pcb->pcb_savefpu.sv_87.sv_env.en_cw = __NetBSD_NPXCW__;
+
+       tf = l->l_md.md_regs;
+       tf->tf_gs = LSEL(LUDATA_SEL, SEL_UPL);
+       tf->tf_fs = LSEL(LUDATA_SEL, SEL_UPL);
+       tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
+       tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
+       tf->tf_edi = 0;
+       tf->tf_esi = 0;
+       tf->tf_ebp = 0;
+       tf->tf_ebx = (int)l->l_proc->p_psstr;
+       tf->tf_edx = 0;
+       tf->tf_ecx = 0;
+       tf->tf_eax = 0;
+       tf->tf_eip = pack->ep_entry;
+       tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
+           LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
+       tf->tf_eflags = PSL_USERSET;
+       tf->tf_esp = stack;
+       tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
+}
+
+/*
+ * Initialize segments and descriptor tables
+ */
+
+union  descriptor *gdt, *ldt;
+struct gate_descriptor *idt;
+char idt_allocmap[NIDT];
+struct simplelock idt_lock = SIMPLELOCK_INITIALIZER;
+#ifdef I586_CPU
+union  descriptor *pentium_idt;
+#endif
+extern  struct user *proc0paddr;
+
+void
+setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
+    int sel)
+{
+
+       gd->gd_looffset = (int)func;
+       gd->gd_selector = sel;
+       gd->gd_stkcpy = args;
+       gd->gd_xx = 0;
+       gd->gd_type = type;
+       gd->gd_dpl = dpl;
+       gd->gd_p = 1;
+       gd->gd_hioffset = (int)func >> 16;
+}
+
+void
+unsetgate(struct gate_descriptor *gd)
+{
+       gd->gd_p = 0;
+       gd->gd_hioffset = 0;
+       gd->gd_looffset = 0;
+       gd->gd_selector = 0;
+       gd->gd_xx = 0;
+       gd->gd_stkcpy = 0;
+       gd->gd_type = 0;
+       gd->gd_dpl = 0;
+}
+
+
+void
+setregion(struct region_descriptor *rd, void *base, size_t limit)
+{
+
+       rd->rd_limit = (int)limit;
+       rd->rd_base = (int)base;
+}
+
+void
+setsegment(struct segment_descriptor *sd, void *base, size_t limit, int type,
+    int dpl, int def32, int gran)
+{
+
+       sd->sd_lolimit = (int)limit;
+       sd->sd_lobase = (int)base;
+       sd->sd_type = type;
+       sd->sd_dpl = dpl;
+       sd->sd_p = 1;
+       sd->sd_hilimit = (int)limit >> 16;
+       sd->sd_xx = 0;
+       sd->sd_def32 = def32;
+       sd->sd_gran = gran;
+       sd->sd_hibase = (int)base >> 24;
+}
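+
+/*
+ * XXX note: with gran=1 the limit is counted in 4KB pages, so e.g. the
+ *
+ *     setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL,
+ *         1, 1);
+ *
+ * call in initgdt() below describes a flat segment covering
+ * 0..0xfc3fffff -- stopping below the range this Xen version is assumed
+ * to reserve for itself at the top of the address space -- where a
+ * native kernel would use the full-4GB limit 0xfffff.
+ */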
+
+#define        IDTVEC(name)    __CONCAT(X, name)
+typedef void (vector)(void);
+extern vector IDTVEC(syscall);
+extern vector IDTVEC(osyscall);
+extern vector *IDTVEC(exceptions)[];
+#ifdef COMPAT_SVR4
+extern vector IDTVEC(svr4_fasttrap);
+#endif /* COMPAT_SVR4 */
+#ifdef COMPAT_MACH
+extern vector IDTVEC(mach_trap);
+#endif
+#define MAX_XEN_IDT 128
+trap_info_t xen_idt[MAX_XEN_IDT];
+int xen_idt_idx;
+
+#define        KBTOB(x)        ((size_t)(x) * 1024UL)
+
+void cpu_init_idt()
+{
+       struct region_descriptor region;
+
+       panic("cpu_init_idt");
+#ifdef I586_CPU
+       setregion(&region, pentium_idt, NIDT * sizeof(idt[0]) - 1);
+#else
+       setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
+#endif
+        lidt(&region);
+}
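+
+/*
+ * XXX note: the unconditional panic() above looks deliberate: a Xen
+ * guest cannot lidt a private IDT, so traps are registered with the
+ * hypervisor through the xen_idt[] trap_info_t table declared earlier
+ * (presumably via HYPERVISOR_set_trap_table) and this native path must
+ * never be reached.
+ */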
+
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+void
+add_mem_cluster(u_int64_t seg_start, u_int64_t seg_end, u_int32_t type)
+{
+       extern struct extent *iomem_ex;
+       int i;
+
+       if (seg_end > 0x100000000ULL) {
+               printf("WARNING: skipping large "
+                   "memory map entry: "
+                   "0x%qx/0x%qx/0x%x\n",
+                   seg_start,
+                   (seg_end - seg_start),
+                   type);
+               return;
+       }
+
+       /*
+        * XXX Chop the last page off the size so that
+        * XXX it can fit in avail_end.
+        */
+       if (seg_end == 0x100000000ULL)
+               seg_end -= PAGE_SIZE;
+
+       if (seg_end <= seg_start)
+               return;
+
+       for (i = 0; i < mem_cluster_cnt; i++) {
+               if ((mem_clusters[i].start == round_page(seg_start))
+                   && (mem_clusters[i].size
+                           == trunc_page(seg_end) - mem_clusters[i].start)) {
+#ifdef DEBUG_MEMLOAD
+                       printf("WARNING: skipping duplicate segment entry\n");
+#endif
+                       return;
+               }
+       }
+
+       /*
+        * Allocate the physical addresses used by RAM
+        * from the iomem extent map.  This is done before
+        * the addresses are page rounded just to make
+        * sure we get them all.
+        */
+       if (extent_alloc_region(iomem_ex, seg_start,
+           seg_end - seg_start, EX_NOWAIT)) {
+               /* XXX What should we do? */
+               printf("WARNING: CAN'T ALLOCATE "
+                   "MEMORY SEGMENT "
+                   "(0x%qx/0x%qx/0x%x) FROM "
+                   "IOMEM EXTENT MAP!\n",
+                   seg_start, seg_end - seg_start, type);
+               return;
+       }
+
+       /*
+        * If it's not free memory, skip it.
+        */
+       if (type != BIM_Memory)
+               return;
+
+       /* XXX XXX XXX */
+       if (mem_cluster_cnt >= VM_PHYSSEG_MAX)
+               panic("init386: too many memory segments");
+
+       seg_start = round_page(seg_start);
+       seg_end = trunc_page(seg_end);
+
+       if (seg_start == seg_end)
+               return;
+
+       mem_clusters[mem_cluster_cnt].start = seg_start;
+       mem_clusters[mem_cluster_cnt].size =
+           seg_end - seg_start;
+
+       if (avail_end < seg_end)
+               avail_end = seg_end;
+       physmem += atop(mem_clusters[mem_cluster_cnt].size);
+       mem_cluster_cnt++;
+}
+#endif /* !defined(REALBASEMEM) && !defined(REALEXTMEM) */
+
+void
+initgdt()
+{
+#if !defined(XEN)
+       struct region_descriptor region;
+#else
+       paddr_t frames[16];
+#endif
+
+#if !defined(XEN)
+       gdt = tgdt;
+       memset(gdt, 0, NGDT*sizeof(*gdt));
+#endif
+       /* make gdt gates and memory segments */
+       setsegment(&gdt[GCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 1, 1);
+       setsegment(&gdt[GDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 1, 1);
+       setsegment(&gdt[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
+           SDT_MEMERA, SEL_UPL, 1, 1);
+       setsegment(&gdt[GUCODEBIG_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
+           SDT_MEMERA, SEL_UPL, 1, 1);
+       setsegment(&gdt[GUDATA_SEL].sd, 0, x86_btop(VM_MAXUSER_ADDRESS) - 1,
+           SDT_MEMRWA, SEL_UPL, 1, 1);
+#ifdef COMPAT_MACH
+       setgate(&gdt[GMACHCALLS_SEL].gd, &IDTVEC(mach_trap), 1,
+           SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+#endif
+#if NBIOSCALL > 0
+       /* bios trampoline GDT entries */
+       setsegment(&gdt[GBIOSCODE_SEL].sd, 0, 0xfc3ff, SDT_MEMERA, SEL_KPL, 0,
+           0);
+       setsegment(&gdt[GBIOSDATA_SEL].sd, 0, 0xfc3ff, SDT_MEMRWA, SEL_KPL, 0,
+           0);
+#endif
+       setsegment(&gdt[GCPU_SEL].sd, &cpu_info_primary,
+           sizeof(struct cpu_info)-1, SDT_MEMRWA, SEL_KPL, 1, 1);
+
+#if !defined(XEN)
+       setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
+       lgdt(&region);
+#else
+       frames[0] = xpmap_ptom((uint32_t)gdt - KERNBASE) >> PAGE_SHIFT;
+       /* pmap_kremove((vaddr_t)gdt, PAGE_SIZE); */
+       pmap_kenter_pa((vaddr_t)gdt, (uint32_t)gdt - KERNBASE,
+           VM_PROT_READ);
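+       /*
+        * XXX note: the remap above drops write permission because Xen
+        * validates descriptor contents and refuses GDT pages the guest
+        * can still write; HYPERVISOR_set_gdt() then takes machine frame
+        * numbers, hence the xpmap_ptom() translation.
+        */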
+       XENPRINTK(("loading gdt %lx, %d entries\n", frames[0] << PAGE_SHIFT,
+           LAST_RESERVED_GDT_ENTRY + 1));
+       if (HYPERVISOR_set_gdt(frames, LAST_RESERVED_GDT_ENTRY + 1))
+               panic("HYPERVISOR_set_gdt failed!\n");
+       lgdt_finish();
+#endif
+}
+
+void
+init386(paddr_t first_avail)
+{
+#if !defined(XEN)
+       union descriptor *tgdt;
+#endif
+       extern void consinit(void);
+#if !defined(XEN)
+       extern struct extent *iomem_ex;
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+       struct btinfo_memmap *bim;
+#endif
+       struct region_descriptor region;
+#endif
+       int x;
+#if !defined(XEN)
+       int first16q;
+       u_int64_t seg_start, seg_end;
+       u_int64_t seg_start1, seg_end1;
+#endif
+       paddr_t realmode_reserved_start;
+       psize_t realmode_reserved_size;
+       int needs_earlier_install_pte0;
+#if NBIOSCALL > 0
+       extern int biostramp_image_size;
+       extern u_char biostramp_image[];
+#endif
+
+       XENPRINTK(("HYPERVISOR_shared_info %p\n", HYPERVISOR_shared_info));
+#ifdef XENDEBUG_LOW
+       xen_dbglow_init();
+#endif
+
+       cpu_probe_features(&cpu_info_primary);
+       cpu_feature = cpu_info_primary.ci_feature_flags;
+
+       /* not on Xen... */
+       cpu_feature &= ~(CPUID_PGE|CPUID_PSE|CPUID_MTRR|CPUID_FXSR);
+
+       lwp0.l_addr = proc0paddr;
+       cpu_info_primary.ci_curpcb = &lwp0.l_addr->u_pcb;
+
+       XENPRINTK(("proc0paddr %p pcb %p first_avail %p\n",
+           proc0paddr, cpu_info_primary.ci_curpcb, (void *)first_avail));
+       XENPRINTK(("ptdpaddr %p atdevbase %p\n", (void *)PTDpaddr,
+                     (void *)atdevbase));
+
+       x86_bus_space_init();
+       consinit();     /* XXX SHOULD NOT BE DONE HERE */
+       /*
+        * Initialize PAGE_SIZE-dependent variables.
+        */
+       uvm_setpagesize();
+
+       /*
+        * Saving SSE registers won't work if the save area isn't
+        * 16-byte aligned.
+        */
+       if (offsetof(struct user, u_pcb.pcb_savefpu) & 0xf)
+               panic("init386: pcb_savefpu not 16-byte aligned");
+
+       /*
+        * Start with 2 color bins -- this is just a guess to get us
+        * started.  We'll recolor when we determine the largest cache
+        * sizes on the system.
+        */
+       uvmexp.ncolors = 2;
+
+#if !defined(XEN)
+       /*
+        * BIOS leaves data in physical page 0
+        * Even if it didn't, our VM system doesn't like using zero as a
+        * physical page number.
+        * We may also need pages in low memory (one each) for secondary CPU
+        * startup, for BIOS calls, and for ACPI, plus a page table page to map
+        * them into the first few pages of the kernel's pmap.
+        */
+       avail_start = PAGE_SIZE;
+#else
+       /* Make sure the end of the space used by the kernel is rounded. */
+       first_avail = round_page(first_avail);
+       avail_start = first_avail - KERNBASE;
+       avail_end = ptoa(xen_start_info.nr_pages) +
+               (KERNTEXTOFF - KERNBASE_LOCORE);
+       pmap_pa_start = (KERNTEXTOFF - KERNBASE_LOCORE);
+       pmap_pa_end = avail_end;
+       mem_clusters[0].start = avail_start;
+       mem_clusters[0].size = avail_end - avail_start;
+       mem_cluster_cnt++;
+       physmem += atop(mem_clusters[0].size);
+#endif
+
+       /*
+        * reserve memory for real-mode call
+        */
+       needs_earlier_install_pte0 = 0;
+       realmode_reserved_start = 0;
+       realmode_reserved_size = 0;
+#if NBIOSCALL > 0
+       /* save us a page for trampoline code */
+       realmode_reserved_size += PAGE_SIZE;
+       needs_earlier_install_pte0 = 1;
+#endif
+#ifdef MULTIPROCESSOR                                           /* XXX */
+#if !defined(XEN)
+       KASSERT(avail_start == PAGE_SIZE);                       /* XXX */
+#endif
+       if (realmode_reserved_size < MP_TRAMPOLINE)              /* XXX */
+               realmode_reserved_size = MP_TRAMPOLINE;          /* XXX */
+       needs_earlier_install_pte0 = 1;                          /* XXX */
+#endif                                                          /* XXX */
+#if NACPI > 0
+       /* trampoline code for wake handler */
+       realmode_reserved_size += ptoa(acpi_md_get_npages_of_wakecode()+1);
+       needs_earlier_install_pte0 = 1;
+#endif
+       if (needs_earlier_install_pte0) {
+               /* page table for directory entry 0 */
+               realmode_reserved_size += PAGE_SIZE;
+       }
+       if (realmode_reserved_size>0) {
+               realmode_reserved_start = avail_start;
+               avail_start += realmode_reserved_size;
+       }
+
+#ifdef DEBUG_MEMLOAD
+       printf("mem_cluster_count: %d\n", mem_cluster_cnt);
+#endif
+
+       /*
+        * Call pmap initialization to make new kernel address space.
+        * We must do this before loading pages into the VM system.
+        */
+       pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
+
+#if !defined(XEN)
+#if !defined(REALBASEMEM) && !defined(REALEXTMEM)
+       /*
+        * Check to see if we have a memory map from the BIOS (passed
+        * to us by the boot program).
+        */
+       bim = lookup_bootinfo(BTINFO_MEMMAP);
+       if (bim != NULL && bim->num > 0) {
+#ifdef DEBUG_MEMLOAD
+               printf("BIOS MEMORY MAP (%d ENTRIES):\n", bim->num);
+#endif
+               for (x = 0; x < bim->num; x++) {
+#ifdef DEBUG_MEMLOAD
+                       printf("    addr 0x%qx  size 0x%qx  type 0x%x\n",
+                           bim->entry[x].addr,
+                           bim->entry[x].size,
+                           bim->entry[x].type);
+#endif
+
+                       /*
+                        * If the segment is not memory, skip it.
+                        */
+                       switch (bim->entry[x].type) {
+                       case BIM_Memory:
+                       case BIM_ACPI:
+                       case BIM_NVS:
+                               break;
+                       default:
+                               continue;
+                       }
+
+                       /*
+                        * Sanity check the entry.
+                        * XXX Need to handle uint64_t in extent code
+                        * XXX and 64-bit physical addresses in i386
+                        * XXX port.
+                        */
+                       seg_start = bim->entry[x].addr;
+                       seg_end = bim->entry[x].addr + bim->entry[x].size;
+
+                       /*
+                        *   Avoid Compatibility Holes.
+                        * XXX  Holes within memory space that allow access
+                        * XXX to be directed to the PC-compatible frame buffer
+                        * XXX (0xa0000-0xbffff), to adapter ROM space
+                        * XXX (0xc0000-0xdffff), and to system BIOS space
+                        * XXX (0xe0000-0xfffff).
+                        * XXX  Some laptops (for example, the Toshiba
+                        * XXX Satellite 2550X) report this area as memory,
+                        * XXX which caused problems, so we avoid this area.
+                        */
+                       if (seg_start < 0x100000 && seg_end > 0xa0000) {
+                               printf("WARNING: memory map entry overlaps "
+                                   "with ``Compatibility Holes'': "
+                                   "0x%qx/0x%qx/0x%x\n", seg_start,
+                                   seg_end - seg_start, bim->entry[x].type);
+                               add_mem_cluster(seg_start, 0xa0000,
+                                   bim->entry[x].type);
+                               add_mem_cluster(0x100000, seg_end,
+                                   bim->entry[x].type);
+                       } else
+                               add_mem_cluster(seg_start, seg_end,
+                                   bim->entry[x].type);
+               }
+       }
+#endif /* ! REALBASEMEM && ! REALEXTMEM */
+       /*
+        * If the loop above didn't find any valid segment, fall back to
+        * former code.
+        */
+       if (mem_cluster_cnt == 0) {
+               /*
+                * Allocate the physical addresses used by RAM from the iomem
+                * extent map.  This is done before the addresses are
+                * page rounded just to make sure we get them all.
+                */
+               if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem),
+                   EX_NOWAIT)) {
+                       /* XXX What should we do? */
+                       printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
+                           "IOMEM EXTENT MAP!\n");
+               }
+               mem_clusters[0].start = 0;
+               mem_clusters[0].size = trunc_page(KBTOB(biosbasemem));
+               physmem += atop(mem_clusters[0].size);
+               if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
+                   EX_NOWAIT)) {
+                       /* XXX What should we do? */
+                       printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
+                           "IOMEM EXTENT MAP!\n");
+               }
+#if NISADMA > 0
+               /*
+                * Some motherboards/BIOSes remap the 384K of RAM that would
+                * normally be covered by the ISA hole to the end of memory
+                * so that it can be used.  However, on a 16M system, this
+                * would cause bounce buffers to be allocated and used.
+                * This is not desirable behaviour, as more than 384K of
+                * bounce buffers might be allocated.  As a work-around,
+                * we round memory down to the nearest 1M boundary if
+                * we're using any isadma devices and the remapped memory
+                * is what puts us over 16M.
+                */
+               if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
+                       char pbuf[9];
+
+                       format_bytes(pbuf, sizeof(pbuf),
+                           biosextmem - (15*1024));
+                       printf("Warning: ignoring %s of remapped memory\n",
+                           pbuf);
+                       biosextmem = (15*1024);
+               }
+#endif
+               mem_clusters[1].start = IOM_END;
+               mem_clusters[1].size = trunc_page(KBTOB(biosextmem));
+               physmem += atop(mem_clusters[1].size);
+
+               mem_cluster_cnt = 2;
+
+               avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
+       }
+       /*
+        * If we have 16M of RAM or less, just put it all on
+        * the default free list.  Otherwise, put the first
+        * 16M of RAM on a lower priority free list (so that
+        * all of the ISA DMA'able memory won't be eaten up
+        * first-off).
+        */
+       if (avail_end <= (16 * 1024 * 1024))
+               first16q = VM_FREELIST_DEFAULT;
+       else
+               first16q = VM_FREELIST_FIRST16;
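+
+       /*
+        * For example: on a 64MB machine avail_end exceeds 16MB, so
+        * the loop below puts pages under 16MB on the lower-priority
+        * VM_FREELIST_FIRST16 list; VM_FREELIST_DEFAULT is drained
+        * first and ISA DMA'able memory is eaten last.
+        */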
+
+       /* Make sure the end of the space used by the kernel is rounded. */
+       first_avail = round_page(first_avail);
+#endif
+
+       XENPRINTK(("load the memory cluster %p(%d) - %p(%ld)\n",
+           (void *)avail_start, (int)atop(avail_start),
+           (void *)avail_end, (int)atop(avail_end)));
+       uvm_page_physload(atop(avail_start), atop(avail_end),
+           atop(avail_start), atop(avail_end),
+           VM_FREELIST_DEFAULT);
+
+#if !defined(XEN)
+
+       /*
+        * Now, load the memory clusters (which have already been
+        * rounded and truncated) into the VM system.
+        *
+        * NOTE: WE ASSUME THAT MEMORY STARTS AT 0 AND THAT THE KERNEL
+        * IS LOADED AT IOM_END (1M).
+        */
+       for (x = 0; x < mem_cluster_cnt; x++) {
+               seg_start = mem_clusters[x].start;
+               seg_end = mem_clusters[x].start + mem_clusters[x].size;
+               seg_start1 = 0;
+               seg_end1 = 0;
+
+               /*
+                * Skip memory before our available starting point.
+                */
+               if (seg_end <= avail_start)
+                       continue;
+
+               if (avail_start >= seg_start && avail_start < seg_end) {
+                       if (seg_start != 0)
+                               panic("init386: memory doesn't start at 0");
+                       seg_start = avail_start;
+                       if (seg_start == seg_end)
+                               continue;
+               }
+
+               /*
+                * If this segment contains the kernel, split it
+                * in two, around the kernel.
+                */
+               if (seg_start <= IOM_END && first_avail <= seg_end) {
+                       seg_start1 = first_avail;
+                       seg_end1 = seg_end;
+                       seg_end = IOM_END;
+               }
+
+               /* First hunk */
+               if (seg_start != seg_end) {
+                       if (seg_start < (16 * 1024 * 1024) &&
+                           first16q != VM_FREELIST_DEFAULT) {
+                               u_int64_t tmp;
+
+                               if (seg_end > (16 * 1024 * 1024))
+                                       tmp = (16 * 1024 * 1024);
+                               else
+                                       tmp = seg_end;
+
+                               if (tmp != seg_start) {
+#ifdef DEBUG_MEMLOAD
+                                       printf("loading 0x%qx-0x%qx "
+                                           "(0x%lx-0x%lx)\n",
+                                           seg_start, tmp,
+                                           atop(seg_start), atop(tmp));
+#endif
+                                       uvm_page_physload(atop(seg_start),
+                                           atop(tmp), atop(seg_start),
+                                           atop(tmp), first16q);
+                               }
+                               seg_start = tmp;
+                       }
+
+                       if (seg_start != seg_end) {
+#ifdef DEBUG_MEMLOAD
+                               printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
+                                   seg_start, seg_end,
+                                   atop(seg_start), atop(seg_end));
+#endif
+                               uvm_page_physload(atop(seg_start),
+                                   atop(seg_end), atop(seg_start),
+                                   atop(seg_end), VM_FREELIST_DEFAULT);
+                       }
+               }
+
+               /* Second hunk */
+               if (seg_start1 != seg_end1) {
+                       if (seg_start1 < (16 * 1024 * 1024) &&
+                           first16q != VM_FREELIST_DEFAULT) {
+                               u_int64_t tmp;
+
+                               if (seg_end1 > (16 * 1024 * 1024))
+                                       tmp = (16 * 1024 * 1024);
+                               else
+                                       tmp = seg_end1;
+
+                               if (tmp != seg_start1) {
+#ifdef DEBUG_MEMLOAD
+                                       printf("loading 0x%qx-0x%qx "
+                                           "(0x%lx-0x%lx)\n",
+                                           seg_start1, tmp,
+                                           atop(seg_start1), atop(tmp));
+#endif
+                                       uvm_page_physload(atop(seg_start1),
+                                           atop(tmp), atop(seg_start1),
+                                           atop(tmp), first16q);
+                               }
+                               seg_start1 = tmp;
+                       }
+
+                       if (seg_start1 != seg_end1) {
+#ifdef DEBUG_MEMLOAD
+                               printf("loading 0x%qx-0x%qx (0x%lx-0x%lx)\n",
+                                   seg_start1, seg_end1,
+                                   atop(seg_start1), atop(seg_end1));
+#endif
+                               uvm_page_physload(atop(seg_start1),
+                                   atop(seg_end1), atop(seg_start1),
+                                   atop(seg_end1), VM_FREELIST_DEFAULT);
+                       }
+               }
+       }
+#endif
+
+       /*
+        * Steal memory for the message buffer (at end of core).
+        */
+       {
+               struct vm_physseg *vps;
+               psize_t sz = round_page(MSGBUFSIZE);
+               psize_t reqsz = sz;
+
+               for (x = 0; x < vm_nphysseg; x++) {
+                       vps = &vm_physmem[x];
+                       if (ptoa(vps->avail_end) == avail_end)
+                               goto found;
+               }
+               panic("init386: can't find end of memory");
+
+       found:
+               /* Shrink so it'll fit in the last segment. */
+               if ((vps->avail_end - vps->avail_start) < atop(sz))
+                       sz = ptoa(vps->avail_end - vps->avail_start);
+
+               vps->avail_end -= atop(sz);
+               vps->end -= atop(sz);
+               msgbuf_paddr = ptoa(vps->avail_end);
+
+               /* Remove the last segment if it now has no pages. */
+               if (vps->start == vps->end) {
+                       for (vm_nphysseg--; x < vm_nphysseg; x++)
+                               vm_physmem[x] = vm_physmem[x + 1];
+               }
+
+               /* Now find where the new avail_end is. */
+               for (avail_end = 0, x = 0; x < vm_nphysseg; x++)
+                       if (vm_physmem[x].avail_end > avail_end)
+                               avail_end = vm_physmem[x].avail_end;
+               avail_end = ptoa(avail_end);
+
+               /* Warn if the message buffer had to be shrunk. */
+               if (sz != reqsz)
+                       printf("WARNING: %ld bytes not available for msgbuf "
+                           "in last cluster (%ld used)\n", reqsz, sz);
+       }
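+
+       /*
+        * Worked example (sizes assumed): if MSGBUFSIZE rounds up to
+        * 32KB but only four pages (16KB) remain available in the last
+        * segment, sz shrinks to 16KB, msgbuf_paddr takes those pages,
+        * and the WARNING above reports the shortfall.
+        */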
+
+       /*
+        * install PT page for the first 4M if needed.
+        */
+       if (needs_earlier_install_pte0) {
+               paddr_t paddr;
+#ifdef DIAGNOSTIC
+               if (realmode_reserved_size < PAGE_SIZE) {
+                       panic("cannot steal memory for first 4M PT page.");
+               }
+#endif
+               paddr=realmode_reserved_start+realmode_reserved_size-PAGE_SIZE;
+               pmap_enter(pmap_kernel(), (vaddr_t)vtopte(0), paddr,
+                          VM_PROT_READ|VM_PROT_WRITE,
+                          PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
+               pmap_update(pmap_kernel());
+               /* make sure it is clean before using */
+               memset(vtopte(0), 0, PAGE_SIZE);
+               realmode_reserved_size -= PAGE_SIZE;
+       }
+
+#if NBIOSCALL > 0
+       /*
+        * this should be caught at kernel build time, but put it here
+        * in case someone tries to fake it out...
+        */
+#ifdef DIAGNOSTIC
+       if (realmode_reserved_start > BIOSTRAMP_BASE ||
+           (realmode_reserved_start+realmode_reserved_size) < (BIOSTRAMP_BASE+
+                                                              PAGE_SIZE)) {
+           panic("cannot steal memory for PT page of bioscall.");
+       }
+       if (biostramp_image_size > PAGE_SIZE)
+           panic("biostramp_image_size too big: %x vs. %x",
+                 biostramp_image_size, PAGE_SIZE);
+#endif
+       pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, /* virtual */
+                      (paddr_t)BIOSTRAMP_BASE, /* physical */
+                      VM_PROT_ALL);            /* protection */
+       pmap_update(pmap_kernel());
+       memcpy((caddr_t)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
+#ifdef DEBUG_BIOSCALL
+       printf("biostramp installed @ %x\n", BIOSTRAMP_BASE);
+#endif
+       realmode_reserved_size  -= PAGE_SIZE;
+       realmode_reserved_start += PAGE_SIZE;
+#endif
+
+#if NACPI > 0
+       /*
+        * Steal memory for the acpi wake code
+        */
+       {
+               paddr_t paddr, p;
+               psize_t sz;
+               int npg;
+
+               paddr = realmode_reserved_start;
+               npg = acpi_md_get_npages_of_wakecode();
+               sz = ptoa(npg);
+#ifdef DIAGNOSTIC
+               if (realmode_reserved_size < sz) {
+                       panic("cannot steal memory for ACPI wake code.");
+               }
+#endif
+
+               /* identical mapping */
+               p = paddr;
+               for (x=0; x<npg; x++) {
+                       printf("kenter: 0x%08X\n", (unsigned)p);
+                       pmap_kenter_pa((vaddr_t)p, p, VM_PROT_ALL);
+                       p += PAGE_SIZE;
+               }
+               pmap_update(pmap_kernel());
+
+               acpi_md_install_wakecode(paddr);
+
+               realmode_reserved_size  -= sz;
+               realmode_reserved_start += sz;
+       }
+#endif
+
+       pmap_enter(pmap_kernel(), idt_vaddr, idt_paddr,
+           VM_PROT_READ|VM_PROT_WRITE, PMAP_WIRED|VM_PROT_READ|VM_PROT_WRITE);
+       pmap_update(pmap_kernel());
+       memset((void *)idt_vaddr, 0, PAGE_SIZE);
+
+#if !defined(XEN)
+       idt = (struct gate_descriptor *)idt_vaddr;
+#ifdef I586_CPU
+       pmap_enter(pmap_kernel(), pentium_idt_vaddr, idt_paddr,
+           VM_PROT_READ, PMAP_WIRED|VM_PROT_READ);
+       pentium_idt = (union descriptor *)pentium_idt_vaddr;
+#endif
+#endif
+       pmap_update(pmap_kernel());
+
+       initgdt();
+
+       HYPERVISOR_set_callbacks(
+               GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
+               GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
+
+#if !defined(XEN)
+       tgdt = gdt;
+       gdt = (union descriptor *)
+                   ((char *)idt + NIDT * sizeof (struct gate_descriptor));
+       ldt = gdt + NGDT;
+
+       memcpy(gdt, tgdt, NGDT*sizeof(*gdt));
+
+       setsegment(&gdt[GLDT_SEL].sd, ldt, NLDT * sizeof(ldt[0]) - 1,
+           SDT_SYSLDT, SEL_KPL, 0, 0);
+#else
+       ldt = (union descriptor *)idt_vaddr;
+#endif
+
+       /* make ldt gates and memory segments */
+       setgate(&ldt[LSYS5CALLS_SEL].gd, &IDTVEC(osyscall), 1,
+           SDT_SYS386CGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+
+       ldt[LUCODE_SEL] = gdt[GUCODE_SEL];
+       ldt[LUCODEBIG_SEL] = gdt[GUCODEBIG_SEL];
+       ldt[LUDATA_SEL] = gdt[GUDATA_SEL];
+       ldt[LSOL26CALLS_SEL] = ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
+
+#if !defined(XEN)
+       /* exceptions */
+       for (x = 0; x < 32; x++) {
+               setgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386TGT,
+                   (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
+                   GSEL(GCODE_SEL, SEL_KPL));
+               idt_allocmap[x] = 1;
+       }
+
+       /* new-style interrupt gate for syscalls */
+       setgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386TGT, SEL_UPL,
+           GSEL(GCODE_SEL, SEL_KPL));
+       idt_allocmap[128] = 1;
+#ifdef COMPAT_SVR4
+       setgate(&idt[0xd2], &IDTVEC(svr4_fasttrap), 0, SDT_SYS386TGT,
+           SEL_UPL, GSEL(GCODE_SEL, SEL_KPL));
+       idt_allocmap[0xd2] = 1;
+#endif /* COMPAT_SVR4 */
+#endif
+
+       memset(xen_idt, 0, sizeof(trap_info_t) * MAX_XEN_IDT);
+       xen_idt_idx = 0;
+       for (x = 0; x < 32; x++) {
+               KASSERT(xen_idt_idx < MAX_XEN_IDT);
+               xen_idt[xen_idt_idx].vector = x;
+               xen_idt[xen_idt_idx].flags =
+                       (x == 3 || x == 4) ? SEL_UPL : SEL_XEN;
+               xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+               xen_idt[xen_idt_idx].address =
+                       (uint32_t)IDTVEC(exceptions)[x];
+               xen_idt_idx++;
+       }
+       KASSERT(xen_idt_idx < MAX_XEN_IDT);
+       xen_idt[xen_idt_idx].vector = 128;
+       xen_idt[xen_idt_idx].flags = SEL_UPL;
+       xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+       xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(syscall);
+       xen_idt_idx++;
+#ifdef COMPAT_SVR4
+       KASSERT(xen_idt_idx < MAX_XEN_IDT);
+       xen_idt[xen_idt_idx].vector = 0xd2;
+       xen_idt[xen_idt_idx].flags = SEL_UPL;
+       xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
+       xen_idt[xen_idt_idx].address = (uint32_t)&IDTVEC(svr4_fasttrap);
+       xen_idt_idx++;
+#endif /* COMPAT_SVR4 */
+
+#if !defined(XEN)
+       setregion(&region, gdt, NGDT * sizeof(gdt[0]) - 1);
+       lgdt(&region);
+#else
+       lldt(GSEL(GLDT_SEL, SEL_KPL));
+#endif
+
+#if !defined(XEN)
+       cpu_init_idt();
+#else
+       db_trap_callback = ddb_trap_hook;
+
+       XENPRINTF(("HYPERVISOR_set_trap_table %p\n", xen_idt));
+       if (HYPERVISOR_set_trap_table(xen_idt))
+               panic("HYPERVISOR_set_trap_table %p failed\n", xen_idt);
+#endif
+
+#if NKSYMS || defined(DDB) || defined(LKM)
+       {
+               extern int end;
+               extern int *esym;
+               struct btinfo_symtab *symtab;
+
+#ifdef DDB
+               db_machine_init();
+#endif
+
+               symtab = lookup_bootinfo(BTINFO_SYMTAB);
+
+               if (symtab) {
+                       symtab->ssym += KERNBASE;
+                       symtab->esym += KERNBASE;
+                       ksyms_init(symtab->nsym, (int *)symtab->ssym,
+                           (int *)symtab->esym);
+               }
+               else
+                       ksyms_init(*(int *)&end, ((int *)&end) + 1, esym);
+       }
+#endif
+#ifdef DDB
+       if (boothowto & RB_KDB)
+               Debugger();
+#endif
+#ifdef IPKDB
+       ipkdb_init();
+       if (boothowto & RB_KDB)
+               ipkdb_connect(0);
+#endif
+#ifdef KGDB
+       kgdb_port_init();
+       if (boothowto & RB_KDB) {
+               kgdb_debug_init = 1;
+               kgdb_connect(1);
+       }
+#endif
+
+#if NMCA > 0
+       /* check for MCA bus, needs to be done before ISA stuff - if
+        * MCA is detected, ISA needs to use level triggered interrupts
+        * by default */
+       mca_busprobe();
+#endif
+
+#if defined(XEN)
+       events_default_setup();
+#else
+       intr_default_setup();
+#endif
+
+       /* Initialize software interrupts. */
+       softintr_init();
+
+       splraise(IPL_IPI);
+       enable_intr();
+
+       if (physmem < btoc(2 * 1024 * 1024)) {
+               printf("warning: too little memory available; "
+                      "have %lu bytes, want %lu bytes\n"
+                      "running in degraded mode\n"
+                      "press a key to confirm\n\n",
+                      ptoa(physmem), 2*1024*1024UL);
+               cngetc();
+       }
+
+#ifdef __HAVE_CPU_MAXPROC
+       /* Make sure maxproc is sane */
+       if (maxproc > cpu_maxproc())
+               maxproc = cpu_maxproc();
+#endif
+}
+
+#ifdef COMPAT_NOMID
+static int
+exec_nomid(struct proc *p, struct exec_package *epp)
+{
+       int error;
+       u_long midmag, magic;
+       u_short mid;
+       struct exec *execp = epp->ep_hdr;
+
+       /* check on validity of epp->ep_hdr performed by exec_out_makecmds */
+
+       midmag = ntohl(execp->a_midmag);
+       mid = (midmag >> 16) & 0xffff;
+       magic = midmag & 0xffff;
+
+       if (magic == 0) {
+               magic = (execp->a_midmag & 0xffff);
+               mid = MID_ZERO;
+       }
+
+       midmag = mid << 16 | magic;
+
+       switch (midmag) {
+       case (MID_ZERO << 16) | ZMAGIC:
+               /*
+                * 386BSD's ZMAGIC format:
+                */
+               error = exec_aout_prep_oldzmagic(p, epp);
+               break;
+
+       case (MID_ZERO << 16) | QMAGIC:
+               /*
+                * BSDI's QMAGIC format:
+                * same as new ZMAGIC format, but with different magic number
+                */
+               error = exec_aout_prep_zmagic(p, epp);
+               break;
+
+       case (MID_ZERO << 16) | NMAGIC:
+               /*
+                * BSDI's NMAGIC format:
+                * same as the standard NMAGIC format, but with a
+                * different magic number and with text starting at 0.
+                */
+               error = exec_aout_prep_oldnmagic(p, epp);
+               break;
+
+       case (MID_ZERO << 16) | OMAGIC:
+               /*
+                * BSDI's OMAGIC format:
+                * same as the standard OMAGIC format, but with a
+                * different magic number and with text starting at 0.
+                */
+               error = exec_aout_prep_oldomagic(p, epp);
+               break;
+
+       default:
+               error = ENOEXEC;
+       }
+
+       return error;
+}
+#endif
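+
+/*
+ * Worked example of the midmag decoding in exec_nomid() (values
+ * assumed): a 386BSD ZMAGIC binary stores a_midmag in network byte
+ * order; with MID_ZERO == 0 and ZMAGIC == 0413 (0x10b),
+ *
+ *     midmag = ntohl(execp->a_midmag);        -- 0x0000010b
+ *     mid    = (midmag >> 16) & 0xffff;       -- 0 (MID_ZERO)
+ *     magic  = midmag & 0xffff;               -- 0x10b (ZMAGIC)
+ *
+ * which matches (MID_ZERO << 16) | ZMAGIC and selects
+ * exec_aout_prep_oldzmagic().
+ */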
+
+/*
+ * cpu_exec_aout_makecmds():
+ *     CPU-dependent a.out format hook for execve().
+ *
+ * Determine if the given exec package refers to something which we
+ * understand and, if so, set up the vmcmds for it.
+ *
+ * On the i386, old (386bsd) ZMAGIC binaries and BSDI QMAGIC binaries
+ * are understood if COMPAT_NOMID is given as a kernel option.
+ */
+int
+cpu_exec_aout_makecmds(struct proc *p, struct exec_package *epp)
+{
+       int error = ENOEXEC;
+
+#ifdef COMPAT_NOMID
+       if ((error = exec_nomid(p, epp)) == 0)
+               return error;
+#endif /* COMPAT_NOMID */
+
+       return error;
+}
+
+void *
+lookup_bootinfo(int type)
+{
+       struct btinfo_common *help;
+       int n = *(int*)bootinfo;
+       help = (struct btinfo_common *)(bootinfo + sizeof(int));
+       while(n--) {
+               if(help->type == type)
+                       return(help);
+               help = (struct btinfo_common *)((char*)help + help->len);
+       }
+       return(0);
+}
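+
+/*
+ * Layout of the bootinfo area walked above (sketch):
+ *
+ *     [int nentry][btinfo_common #0][btinfo_common #1]...
+ *                  {len, type, ...}  {len, type, ...}
+ *
+ * each entry leads with its total length, so the walk advances by
+ * help->len.  Typical use, as in init386() above:
+ *
+ *     struct btinfo_memmap *bim = lookup_bootinfo(BTINFO_MEMMAP);
+ */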
+
+#include <dev/ic/mc146818reg.h>                /* for NVRAM POST */
+#include <i386/isa/nvram.h>            /* for NVRAM POST */
+
+void
+cpu_reset()
+{
+
+       disable_intr();
+
+#if 0
+       /*
+        * Ensure the NVRAM reset byte contains something vaguely sane.
+        */
+
+       outb(IO_RTC, NVRAM_RESET);
+       outb(IO_RTC+1, NVRAM_RESET_RST);
+
+       /*
+        * The keyboard controller has 4 random output pins, one of which is
+        * connected to the RESET pin on the CPU in many PCs.  We tell the
+        * keyboard controller to pulse this line a couple of times.
+        */
+       outb(IO_KBD + KBCMDP, KBC_PULSE0);
+       delay(100000);
+       outb(IO_KBD + KBCMDP, KBC_PULSE0);
+       delay(100000);
+#endif
+
+       HYPERVISOR_reboot();
+
+       for (;;);
+}
+
+void
+cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
+{
+       const struct trapframe *tf = l->l_md.md_regs;
+       __greg_t *gr = mcp->__gregs;
+       __greg_t ras_eip;
+
+       /* Save register context. */
+#ifdef VM86
+       if (tf->tf_eflags & PSL_VM) {
+               gr[_REG_GS]  = tf->tf_vm86_gs;
+               gr[_REG_FS]  = tf->tf_vm86_fs;
+               gr[_REG_ES]  = tf->tf_vm86_es;
+               gr[_REG_DS]  = tf->tf_vm86_ds;
+               gr[_REG_EFL] = get_vflags(l);
+       } else
+#endif
+       {
+               gr[_REG_GS]  = tf->tf_gs;
+               gr[_REG_FS]  = tf->tf_fs;
+               gr[_REG_ES]  = tf->tf_es;
+               gr[_REG_DS]  = tf->tf_ds;
+               gr[_REG_EFL] = tf->tf_eflags;
+       }
+       gr[_REG_EDI]    = tf->tf_edi;
+       gr[_REG_ESI]    = tf->tf_esi;
+       gr[_REG_EBP]    = tf->tf_ebp;
+       gr[_REG_EBX]    = tf->tf_ebx;
+       gr[_REG_EDX]    = tf->tf_edx;
+       gr[_REG_ECX]    = tf->tf_ecx;
+       gr[_REG_EAX]    = tf->tf_eax;
+       gr[_REG_EIP]    = tf->tf_eip;
+       gr[_REG_CS]     = tf->tf_cs;
+       gr[_REG_ESP]    = tf->tf_esp;
+       gr[_REG_UESP]   = tf->tf_esp;
+       gr[_REG_SS]     = tf->tf_ss;
+       gr[_REG_TRAPNO] = tf->tf_trapno;
+       gr[_REG_ERR]    = tf->tf_err;
+
+       if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
+           (caddr_t) gr[_REG_EIP])) != -1)
+               gr[_REG_EIP] = ras_eip;
+
+       *flags |= _UC_CPU;
+
+       /* Save floating point register context, if any. */
+       if ((l->l_md.md_flags & MDL_USEDFPU) != 0) {
+#if NNPX > 0
+               /*
+                * If this process is the current FP owner, dump its
+                * context to the PCB first.
+                * XXX npxsave() also clears the FPU state; depending on the
+                * XXX application this might be a penalty.
+                */
+               if (l->l_addr->u_pcb.pcb_fpcpu) {
+                       npxsave_lwp(l, 1);
+               }
+#endif
+               if (i386_use_fxsave) {
+                       memcpy(&mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+                           &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
+                           sizeof (mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm));
+                       *flags |= _UC_FXSAVE;
+               } else {
+                       memcpy(&mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+                           &l->l_addr->u_pcb.pcb_savefpu.sv_87,
+                           sizeof (mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state));
+               }
+#if 0
+               /* Apparently nothing ever touches this. */
+               ucp->mcp.mc_fp.fp_emcsts = l->l_addr->u_pcb.pcb_saveemc;
+#endif
+               *flags |= _UC_FPU;
+       }
+}
+
+int
+cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
+{
+       struct trapframe *tf = l->l_md.md_regs;
+       __greg_t *gr = mcp->__gregs;
+
+       /* Restore register context, if any. */
+       if ((flags & _UC_CPU) != 0) {
+#ifdef VM86
+               if (gr[_REG_EFL] & PSL_VM) {
+                       tf->tf_vm86_gs = gr[_REG_GS];
+                       tf->tf_vm86_fs = gr[_REG_FS];
+                       tf->tf_vm86_es = gr[_REG_ES];
+                       tf->tf_vm86_ds = gr[_REG_DS];
+                       set_vflags(l, gr[_REG_EFL]);
+                       if (flags & _UC_VM) {
+                               void syscall_vm86(struct trapframe *);
+                               l->l_proc->p_md.md_syscall = syscall_vm86;
+                       }
+               } else
+#endif
+               {
+                       /*
+                        * Check for security violations.  If we're returning
+                        * to protected mode, the CPU will validate the segment
+                        * registers automatically and generate a trap on
+                        * violations.  We handle the trap, rather than doing
+                        * all of the checking here.
+                        */
+                       if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
+                           !USERMODE(gr[_REG_CS], gr[_REG_EFL])) {
+                               printf("cpu_setmcontext error: uc EFL: 0x%08x"
+                                   " tf EFL: 0x%08x uc CS: 0x%x\n",
+                                   gr[_REG_EFL], tf->tf_eflags, gr[_REG_CS]);
+                               return (EINVAL);
+                       }
+                       tf->tf_gs = gr[_REG_GS];
+                       tf->tf_fs = gr[_REG_FS];
+                       tf->tf_es = gr[_REG_ES];
+                       tf->tf_ds = gr[_REG_DS];
+                       /* Only change the user-alterable part of eflags */
+                       tf->tf_eflags &= ~PSL_USER;
+                       tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
+               }
+               tf->tf_edi    = gr[_REG_EDI];
+               tf->tf_esi    = gr[_REG_ESI];
+               tf->tf_ebp    = gr[_REG_EBP];
+               tf->tf_ebx    = gr[_REG_EBX];
+               tf->tf_edx    = gr[_REG_EDX];
+               tf->tf_ecx    = gr[_REG_ECX];
+               tf->tf_eax    = gr[_REG_EAX];
+               tf->tf_eip    = gr[_REG_EIP];
+               tf->tf_cs     = gr[_REG_CS];
+               tf->tf_esp    = gr[_REG_UESP];
+               tf->tf_ss     = gr[_REG_SS];
+       }
+
+       /* Restore floating point register context, if any. */
+       if ((flags & _UC_FPU) != 0) {
+#if NNPX > 0
+               /*
+                * If we were using the FPU, forget that we were.
+                */
+               if (l->l_addr->u_pcb.pcb_fpcpu != NULL)
+                       npxsave_lwp(l, 0);
+#endif
+               if (flags & _UC_FXSAVE) {
+                       if (i386_use_fxsave) {
+                               memcpy(
+                                       &l->l_addr->u_pcb.pcb_savefpu.sv_xmm,
+                                       &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+                                       sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_xmm));
+                       } else {
+                               /* This is a weird corner case */
+                               process_xmm_to_s87((struct savexmm *)
+                                   &mcp->__fpregs.__fp_reg_set.__fp_xmm_state.__fp_xmm,
+                                   &l->l_addr->u_pcb.pcb_savefpu.sv_87);
+                       }
+               } else {
+                       if (i386_use_fxsave) {
+                               process_s87_to_xmm((struct save87 *)
+                                   &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+                                   &l->l_addr->u_pcb.pcb_savefpu.sv_xmm);
+                       } else {
+                               memcpy(&l->l_addr->u_pcb.pcb_savefpu.sv_87,
+                                   &mcp->__fpregs.__fp_reg_set.__fpchip_state.__fp_state,
+                                   sizeof (l->l_addr->u_pcb.pcb_savefpu.sv_87));
+                       }
+               }
+               /* If not set already. */
+               l->l_md.md_flags |= MDL_USEDFPU;
+#if 0
+               /* Apparently unused. */
+               l->l_addr->u_pcb.pcb_saveemc = mcp->mc_fp.fp_emcsts;
+#endif
+       }
+       if (flags & _UC_SETSTACK)
+               l->l_proc->p_sigctx.ps_sigstk.ss_flags |= SS_ONSTACK;
+       if (flags & _UC_CLRSTACK)
+               l->l_proc->p_sigctx.ps_sigstk.ss_flags &= ~SS_ONSTACK;
+       return (0);
+}
+
+void
+cpu_initclocks()
+{
+       (*initclock_func)();
+}
+
+#ifdef MULTIPROCESSOR
+void
+need_resched(struct cpu_info *ci)
+{
+
+       if (ci->ci_want_resched)
+               return;
+
+       ci->ci_want_resched = 1;
+       if ((ci)->ci_curlwp != NULL)
+               aston((ci)->ci_curlwp->l_proc);
+       else if (ci != curcpu())
+               x86_send_ipi(ci, 0);
+}
+#endif
+
+/*
+ * Allocate an IDT vector slot within the given range.
+ * XXX needs locking to avoid MP allocation races.
+ */
+
+int
+idt_vec_alloc(int low, int high)
+{
+       int vec;
+
+       simple_lock(&idt_lock);
+       for (vec = low; vec <= high; vec++) {
+               if (idt_allocmap[vec] == 0) {
+                       idt_allocmap[vec] = 1;
+                       simple_unlock(&idt_lock);
+                       return vec;
+               }
+       }
+       simple_unlock(&idt_lock);
+       return 0;
+}
+
+void
+idt_vec_set(int vec, void (*function)(void))
+{
+       /*
+        * Vector should be allocated, so no locking needed.
+        */
+       KASSERT(idt_allocmap[vec] == 1);
+       setgate(&idt[vec], function, 0, SDT_SYS386IGT, SEL_KPL,
+           GSEL(GCODE_SEL, SEL_KPL));
+}
+
+void
+idt_vec_free(int vec)
+{
+       simple_lock(&idt_lock);
+       unsetgate(&idt[vec]);
+       idt_allocmap[vec] = 0;
+       simple_unlock(&idt_lock);
+}
+
+/*
+ * Number of processes is limited by number of available GDT slots.
+ */
+int
+cpu_maxproc(void)
+{
+#ifdef USER_LDT
+       return ((MAXGDTSIZ - NGDT) / 2);
+#else
+       return (MAXGDTSIZ - NGDT);
+#endif
+}
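+
+/*
+ * For example (values assumed): the i386 GDT can architecturally hold
+ * at most 8192 descriptors.  Each process consumes one slot for its
+ * TSS, plus a second slot for its private LDT when USER_LDT is
+ * configured, so with MAXGDTSIZ == 8192 the limit is roughly
+ * (8192 - NGDT) / 2 processes.
+ */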
+
+#if defined(DDB) || defined(KGDB)
+
+/* 
+ * Callback to output a backtrace when entering ddb.
+ */
+void
+ddb_trap_hook(int where)
+{
+       static int once = 0;
+       db_addr_t db_dot;
+
+       if (once != 0 || where != 1)
+               return;
+       once = 1;
+
+       if (curlwp != NULL) {
+               db_printf("Stopped");
+               if (curproc == NULL)
+                       db_printf("; curlwp = %p,"
+                           " curproc is NULL at\t", curlwp);
+               else
+                       db_printf(" in pid %d.%d (%s) at\t", 
+                           curproc->p_pid, curlwp->l_lid,
+                           curproc->p_comm);
+       } else
+               db_printf("Stopped at\t");
+       db_dot = PC_REGS(DDB_REGS);
+       db_print_loc_and_inst(db_dot);
+
+       db_stack_trace_print((db_expr_t) db_dot, FALSE, 65535,
+           "", db_printf);
+#ifdef DEBUG
+       db_show_regs((db_expr_t) db_dot, FALSE, 65535, "");
+#endif
+}
+
+#endif /* DDB || KGDB */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/pmap.c
new file mode 100644 (file)
index 0000000..8e031eb
--- /dev/null
@@ -0,0 +1,4522 @@
+/*     $NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $  */
+/*     NetBSD: pmap.c,v 1.172 2004/04/12 13:17:46 yamt Exp     */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Charles D. Cranor and
+ *      Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * pmap.c: i386 pmap module rewrite
+ * Chuck Cranor <chuck@ccrc.wustl.edu>
+ * 11-Aug-97
+ *
+ * history of this pmap module: in addition to my own input, i used
+ *    the following references for this rewrite of the i386 pmap:
+ *
+ * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
+ *     BSD hp300 pmap done by Mike Hibler at University of Utah.
+ *     it was then ported to the i386 by William Jolitz of UUNET
+ *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
+ *     project fixed some bugs and provided some speed ups.
+ *
+ * [2] the FreeBSD i386 pmap.   this pmap seems to be the
+ *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
+ *     and David Greenman.
+ *
+ * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
+ *     between several processors.   the VAX version was done by
+ *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
+ *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
+ *     David Golub, and Richard Draves.    the alpha version was
+ *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
+ *     (NetBSD/alpha).
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.1.2.1 2004/05/22 15:57:52 he Exp $");
+
+#include "opt_cputype.h"
+#include "opt_user_ldt.h"
+#include "opt_largepages.h"
+#include "opt_lockdebug.h"
+#include "opt_multiprocessor.h"
+#include "opt_kstack_dr0.h"
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/user.h>
+#include <sys/kernel.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/atomic.h>
+#include <machine/cpu.h>
+#include <machine/specialreg.h>
+#include <machine/gdt.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/xenpmap.h>
+
+void xpmap_find_pte(paddr_t);
+
+/* #define XENDEBUG */
+
+#ifdef XENDEBUG
+#define        XENPRINTF(x) printf x
+#define        XENPRINTK(x) printf x
+#else
+#define        XENPRINTF(x)
+#define        XENPRINTK(x)
+#endif
+#define        PRINTF(x) printf x
+#define        PRINTK(x) printf x
+
+
+/*
+ * general info:
+ *
+ *  - for an explanation of how the i386 MMU hardware works see
+ *    the comments in <machine/pte.h>.
+ *
+ *  - for an explanation of the general memory structure used by
+ *    this pmap (including the recursive mapping), see the comments
+ *    in <machine/pmap.h>.
+ *
+ * this file contains the code for the "pmap module."   the module's
+ * job is to manage the hardware's virtual to physical address mappings.
+ * note that there are two levels of mapping in the VM system:
+ *
+ *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
+ *      to map ranges of virtual address space to objects/files.  for
+ *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
+ *      to the file /bin/ls starting at offset zero."   note that
+ *      the upper layer mapping is not concerned with how individual
+ *      vm_pages are mapped.
+ *
+ *  [2] the lower layer of the VM system (the pmap) maintains the mappings
+ *      from virtual addresses.   it is concerned with which vm_page is
+ *      mapped where.   for example, when you run /bin/ls and start
+ *      at page 0x1000 the fault routine may lookup the correct page
+ *      of the /bin/ls file and then ask the pmap layer to establish
+ *      a mapping for it.
+ *
+ * note that information in the lower layer of the VM system can be
+ * thrown away since it can easily be reconstructed from the info
+ * in the upper layer.
+ *
+ * data structures we use include:
+ *
+ *  - struct pmap: describes the address space of one thread
+ *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
+ *  - struct pv_head: there is one pv_head per managed page of
+ *     physical memory.   the pv_head points to a list of pv_entry
+ *     structures which describe all the <PMAP,VA> pairs that this
+ *      page is mapped in.    this is critical for page based operations
+ *      such as pmap_page_protect() [change protection on _all_ mappings
+ *      of a page]
+ *  - pv_page/pv_page_info: pv_entry's are allocated out of pv_page's.
+ *      if we run out of pv_entry's we allocate a new pv_page and free
+ *      its pv_entrys.
+ * - pmap_remove_record: a list of virtual addresses whose mappings
+ *     have been changed.   used for TLB flushing.
+ */
+
+/*
+ * memory allocation
+ *
+ *  - there are three data structures that we must dynamically allocate:
+ *
+ * [A] new process' page directory page (PDP)
+ *     - plan 1: done at pmap_create() we use
+ *       uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
+ *       allocation.
+ *
+ * if we are low in free physical memory then we sleep in
+ * uvm_km_alloc -- in this case this is ok since we are creating
+ * a new pmap and should not be holding any locks.
+ *
+ * if the kernel is totally out of virtual space
+ * (i.e. uvm_km_alloc returns NULL), then we panic.
+ *
+ * XXX: the fork code currently has no way to return an "out of
+ * memory, try again" error code since uvm_fork [fka vm_fork]
+ * is a void function.
+ *
+ * [B] new page tables pages (PTP)
+ *     - call uvm_pagealloc()
+ *             => success: zero page, add to pm_pdir
+ *             => failure: we are out of free vm_pages, let pmap_enter()
+ *                tell UVM about it.
+ *
+ * note: for kernel PTPs, we start with NKPTP of them.   as we map
+ * kernel memory (at uvm_map time) we check to see if we've grown
+ * the kernel pmap.   if so, we call the optional function
+ * pmap_growkernel() to grow the kernel PTPs in advance.
+ *
+ * [C] pv_entry structures
+ *     - plan 1: try to allocate one off the free list
+ *             => success: done!
+ *             => failure: no more free pv_entrys on the list
+ *     - plan 2: try to allocate a new pv_page to add a chunk of
+ *     pv_entrys to the free list
+ *             [a] obtain a free, unmapped, VA in kmem_map.  either
+ *             we have one saved from a previous call, or we allocate
+ *             one now using a "vm_map_lock_try" in uvm_map
+ *             => success: we have an unmapped VA, continue to [b]
+ *             => failure: unable to lock kmem_map or out of VA in it.
+ *                     move on to plan 3.
+ *             [b] allocate a page in kmem_object for the VA
+ *             => success: map it in, free the pv_entry's, DONE!
+ *             => failure: kmem_object locked, no free vm_pages, etc.
+ *                     save VA for later call to [a], go to plan 3.
+ *     If we fail, we simply let pmap_enter() tell UVM about it.
+ */
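+
+/*
+ * Sketch of the pv_entry plan above (illustrative pseudo-code only;
+ * the real logic lives in pmap_alloc_pv() and pmap_alloc_pvpage()
+ * below):
+ *
+ *     pve = entry from a pv_page on pv_freepages;     -- plan 1
+ *     if (pve == NULL)
+ *             pve = carve a fresh pv_page;            -- plan 2 [a]+[b]
+ *     if (pve == NULL)
+ *             return NULL;        -- pmap_enter() tells UVM about it
+ */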
+
+/*
+ * locking
+ *
+ * we have the following locks that we must contend with:
+ *
+ * "normal" locks:
+ *
+ *  - pmap_main_lock
+ *    this lock is used to prevent deadlock and/or provide mutex
+ *    access to the pmap system.   most operations lock the pmap
+ *    structure first, then they lock the pv_lists (if needed).
+ *    however, some operations such as pmap_page_protect lock
+ *    the pv_lists and then lock pmaps.   in order to prevent a
+ *    cycle, we require a mutex lock when locking the pv_lists
+ *    first.   thus, the "pmap => pv_list" lockers must gain a
+ *    read-lock on pmap_main_lock before locking the pmap.   and
+ *    the "pv_list => pmap" lockers must gain a write-lock on
+ *    pmap_main_lock before locking.    since only one thread
+ *    can write-lock a lock at a time, this provides mutex.
+ *
+ * "simple" locks:
+ *
+ * - pmap lock (per pmap, part of uvm_object)
+ *   this lock protects the fields in the pmap structure including
+ *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
+ *   in the alternate PTE space (since that is determined by the
+ *   entry in the PDP).
+ *
+ * - pvh_lock (per pv_head)
+ *   this lock protects the pv_entry list which is chained off the
+ *   pv_head structure for a specific managed PA.   it is locked
+ *   when traversing the list (e.g. adding/removing mappings,
+ *   syncing R/M bits, etc.)
+ *
+ * - pvalloc_lock
+ *   this lock protects the data structures which are used to manage
+ *   the free list of pv_entry structures.
+ *
+ * - pmaps_lock
+ *   this lock protects the list of active pmaps (headed by "pmaps").
+ *   we lock it when adding or removing pmaps from this list.
+ *
+ */
+
+/*
+ * locking data structures
+ */
+
+static struct simplelock pvalloc_lock;
+static struct simplelock pmaps_lock;
+
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+static struct lock pmap_main_lock;
+
+#define PMAP_MAP_TO_HEAD_LOCK() \
+     (void) spinlockmgr(&pmap_main_lock, LK_SHARED, NULL)
+#define PMAP_MAP_TO_HEAD_UNLOCK() \
+     (void) spinlockmgr(&pmap_main_lock, LK_RELEASE, NULL)
+
+#define PMAP_HEAD_TO_MAP_LOCK() \
+     (void) spinlockmgr(&pmap_main_lock, LK_EXCLUSIVE, NULL)
+#define PMAP_HEAD_TO_MAP_UNLOCK() \
+     spinlockmgr(&pmap_main_lock, LK_RELEASE, (void *) 0)
+
+#else
+
+#define PMAP_MAP_TO_HEAD_LOCK()                /* null */
+#define PMAP_MAP_TO_HEAD_UNLOCK()      /* null */
+
+#define PMAP_HEAD_TO_MAP_LOCK()                /* null */
+#define PMAP_HEAD_TO_MAP_UNLOCK()      /* null */
+
+#endif
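+
+/*
+ * Illustrative ordering sketch only (not real code; lock fields as
+ * used later in this file): a "pmap => pv_list" path such as
+ * pmap_enter() takes
+ *
+ *     PMAP_MAP_TO_HEAD_LOCK();                -- shared
+ *     simple_lock(&pmap->pm_obj.vmobjlock);   -- then the pmap
+ *     simple_lock(&pvh->pvh_lock);            -- then the pv list
+ *
+ * while a "pv_list => pmap" path such as pmap_page_protect() takes
+ *
+ *     PMAP_HEAD_TO_MAP_LOCK();                -- exclusive
+ *     simple_lock(&pvh->pvh_lock);
+ *     simple_lock(&pmap->pm_obj.vmobjlock);
+ */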
+
+#define COUNT(x)       /* nothing */
+
+/*
+ * TLB Shootdown:
+ *
+ * When a mapping is changed in a pmap, the TLB entry corresponding to
+ * the virtual address must be invalidated on all processors.  In order
+ * to accomplish this on systems with multiple processors, messages are
+ * sent from the processor which performs the mapping change to all
+ * processors on which the pmap is active.  For other processors, the
+ * ASN generation number for that processor is invalidated, so that
+ * the next time the pmap is activated on that processor, a new ASN
+ * will be allocated (which implicitly invalidates all TLB entries).
+ *
+ * Shootdown job queue entries are allocated using a simple special-
+ * purpose allocator for speed.
+ */
+struct pmap_tlb_shootdown_job {
+       TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
+       vaddr_t pj_va;                  /* virtual address */
+       pmap_t pj_pmap;                 /* the pmap which maps the address */
+       pt_entry_t pj_pte;              /* the PTE bits */
+       struct pmap_tlb_shootdown_job *pj_nextfree;
+};
+
+#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 32
+union pmap_tlb_shootdown_job_al {
+       struct pmap_tlb_shootdown_job pja_job;
+       char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN];
+};
+
+struct pmap_tlb_shootdown_q {
+       TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;
+       int pq_pte;                     /* aggregate PTE bits */
+       int pq_count;                   /* number of pending requests */
+       __cpu_simple_lock_t pq_slock;   /* spin lock on queue */
+       int pq_flushg;          /* pending flush global */
+       int pq_flushu;          /* pending flush user */
+} pmap_tlb_shootdown_q[X86_MAXPROCS];
+
+#define        PMAP_TLB_MAXJOBS        16
+
+void   pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *);
+struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
+          (struct pmap_tlb_shootdown_q *);
+void   pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *,
+           struct pmap_tlb_shootdown_job *);
+
+__cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
+union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
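+
+/*
+ * Sketch of the special-purpose allocator (illustrative only; the
+ * real pmap_tlb_shootdown_job_get()/_put() appear later in this
+ * file).  Jobs are carved out of pj_page and chained through
+ * pj_nextfree:
+ *
+ *     __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+ *     pj = &pj_free->pja_job;                 -- take the list head
+ *     pj_free = (union pmap_tlb_shootdown_job_al *)pj->pj_nextfree;
+ *     __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+ */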
+
+/*
+ * global data structures
+ */
+
+struct pmap kernel_pmap_store; /* the kernel's pmap (proc0) */
+
+/*
+ * nkpde is the number of kernel PTPs allocated for the kernel at
+ * boot time (NKPTP is a compile time override).   this number can
+ * grow dynamically as needed (but once allocated, we never free
+ * kernel PTPs).
+ */
+
+int nkpde = NKPTP;
+#ifdef NKPDE
+#error "obsolete NKPDE: use NKPTP"
+#endif
+
+/*
+ * pmap_pg_g: if our processor supports PG_G in the PTE then we
+ * set pmap_pg_g to PG_G (otherwise it is zero).
+ */
+
+int pmap_pg_g = 0;
+
+#ifdef LARGEPAGES
+/*
+ * pmap_largepages: if our processor supports PG_PS and we are
+ * using it, this is set to TRUE.
+ */
+
+int pmap_largepages;
+#endif
+
+/*
+ * i386 physical memory comes in a big contig chunk with a small
+ * hole toward the front of it...  the following two paddr_t's
+ * (shared with machdep.c) describe the physical address space
+ * of this machine.
+ */
+paddr_t avail_start;   /* PA of first available physical page */
+paddr_t avail_end;     /* PA of last available physical page */
+
+paddr_t pmap_pa_start; /* PA of first physical page for this domain */
+paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
+
+       /* MA of last physical page of the machine */
+paddr_t pmap_mem_end = HYPERVISOR_VIRT_START; /* updated for domain-0 */
+
+/*
+ * other data structures
+ */
+
+static pt_entry_t protection_codes[8];     /* maps MI prot to i386 prot code */
+static boolean_t pmap_initialized = FALSE; /* pmap_init done yet? */
+
+/*
+ * the following two vaddr_t's are used during system startup
+ * to keep track of how much of the kernel's VM space we have used.
+ * once the system is started, the management of the remaining kernel
+ * VM space is turned over to the kernel_map vm_map.
+ */
+
+static vaddr_t virtual_avail;  /* VA of first free KVA */
+static vaddr_t virtual_end;    /* VA of last free KVA */
+
+
+/*
+ * pv_page management structures: locked by pvalloc_lock
+ */
+
+TAILQ_HEAD(pv_pagelist, pv_page);
+static struct pv_pagelist pv_freepages;        /* list of pv_pages with free entrys */
+static struct pv_pagelist pv_unusedpgs; /* list of unused pv_pages */
+static int pv_nfpvents;                        /* # of free pv entries */
+static struct pv_page *pv_initpage;    /* bootstrap page from kernel_map */
+static vaddr_t pv_cachedva;            /* cached VA for later use */
+
+#define PVE_LOWAT (PVE_PER_PVPAGE / 2) /* free pv_entry low water mark */
+#define PVE_HIWAT (PVE_LOWAT + (PVE_PER_PVPAGE * 2))
+                                       /* high water mark */
+
+static __inline int
+pv_compare(struct pv_entry *a, struct pv_entry *b)
+{
+       if (a->pv_pmap < b->pv_pmap)
+               return (-1);
+       else if (a->pv_pmap > b->pv_pmap)
+               return (1);
+       else if (a->pv_va < b->pv_va)
+               return (-1);
+       else if (a->pv_va > b->pv_va)
+               return (1);
+       else
+               return (0);
+}
+
+SPLAY_PROTOTYPE(pvtree, pv_entry, pv_node, pv_compare);
+SPLAY_GENERATE(pvtree, pv_entry, pv_node, pv_compare);
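+
+/*
+ * Example use of this ordering (sketch; the pv_head root field name
+ * is assumed): look up the pv_entry for a <pmap,va> pair in a page's
+ * splay tree with
+ *
+ *     struct pv_entry key, *pve;
+ *     key.pv_pmap = pmap;
+ *     key.pv_va = va;
+ *     pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &key);
+ */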
+
+/*
+ * linked list of all non-kernel pmaps
+ */
+
+static struct pmap_head pmaps;
+
+/*
+ * pool that pmap structures are allocated from
+ */
+
+struct pool pmap_pmap_pool;
+
+/*
+ * MULTIPROCESSOR: special VA's/ PTE's are actually allocated inside a
+ * X86_MAXPROCS*NPTECL array of PTE's, to avoid cache line thrashing
+ * due to false sharing.
+ */
+
+#ifdef MULTIPROCESSOR
+#define PTESLEW(pte, id) ((pte)+(id)*NPTECL)
+#define VASLEW(va,id) ((va)+(id)*NPTECL*PAGE_SIZE)
+#else
+#define PTESLEW(pte, id) (pte)
+#define VASLEW(va,id) (va)
+#endif
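+
+/*
+ * e.g. (assuming NPTECL == 8, i.e. eight 4-byte PTEs per 32-byte
+ * cache line): CPU 2 uses PTESLEW(csrc_pte, 2) == csrc_pte + 16 and
+ * VASLEW(csrcp, 2) == csrcp + 16 pages of VA, so no two CPUs ever
+ * share a cache line of these special PTEs.
+ */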
+
+/*
+ * special VAs and the PTEs that map them
+ */
+static pt_entry_t *csrc_pte, *cdst_pte, *zero_pte, *ptp_pte;
+static caddr_t csrcp, cdstp, zerop, ptpp;
+
+/*
+ * pool and cache that PDPs are allocated from
+ */
+
+struct pool pmap_pdp_pool;
+struct pool_cache pmap_pdp_cache;
+u_int pmap_pdp_cache_generation;
+
+int    pmap_pdp_ctor(void *, void *, int);
+void   pmap_pdp_dtor(void *, void *);
+
+caddr_t vmmap; /* XXX: used by mem.c... it should really uvm_map_reserve it */
+
+extern vaddr_t msgbuf_vaddr;
+extern paddr_t msgbuf_paddr;
+
+extern vaddr_t idt_vaddr;                      /* we allocate IDT early */
+extern paddr_t idt_paddr;
+
+#if defined(I586_CPU)
+/* stuff to fix the pentium f00f bug */
+extern vaddr_t pentium_idt_vaddr;
+#endif
+
+
+/*
+ * local prototypes
+ */
+
+static struct pv_entry *pmap_add_pvpage(struct pv_page *, boolean_t);
+static struct vm_page  *pmap_alloc_ptp(struct pmap *, int);
+static struct pv_entry *pmap_alloc_pv(struct pmap *, int); /* see codes below */
+#define ALLOCPV_NEED   0       /* need PV now */
+#define ALLOCPV_TRY    1       /* just try to allocate, don't steal */
+#define ALLOCPV_NONEED 2       /* don't need PV, just growing cache */
+static struct pv_entry *pmap_alloc_pvpage(struct pmap *, int);
+static void             pmap_enter_pv(struct pv_head *,
+                                      struct pv_entry *, struct pmap *,
+                                      vaddr_t, struct vm_page *);
+static void             pmap_free_pv(struct pmap *, struct pv_entry *);
+static void             pmap_free_pvs(struct pmap *, struct pv_entry *);
+static void             pmap_free_pv_doit(struct pv_entry *);
+static void             pmap_free_pvpage(void);
+static struct vm_page  *pmap_get_ptp(struct pmap *, int);
+static boolean_t        pmap_is_curpmap(struct pmap *);
+static boolean_t        pmap_is_active(struct pmap *, int);
+static pt_entry_t      *pmap_map_ptes(struct pmap *);
+static struct pv_entry *pmap_remove_pv(struct pv_head *, struct pmap *,
+                                       vaddr_t);
+static void             pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
+static boolean_t        pmap_remove_pte(struct pmap *, struct vm_page *,
+                                        pt_entry_t *, vaddr_t, int32_t *, int);
+static void             pmap_remove_ptes(struct pmap *, struct vm_page *,
+                                         vaddr_t, vaddr_t, vaddr_t, int32_t *,
+                                         int);
+#define PMAP_REMOVE_ALL                0       /* remove all mappings */
+#define PMAP_REMOVE_SKIPWIRED  1       /* skip wired mappings */
+
+static vaddr_t          pmap_tmpmap_pa(paddr_t);
+static pt_entry_t      *pmap_tmpmap_pvepte(struct pv_entry *);
+static void             pmap_tmpunmap_pa(void);
+static void             pmap_tmpunmap_pvepte(struct pv_entry *);
+static void             pmap_unmap_ptes(struct pmap *);
+
+static boolean_t        pmap_reactivate(struct pmap *);
+
+#ifdef DEBUG
+u_int  curapdp;
+#endif
+
+/*
+ * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
+ */
+
+/*
+ * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
+ *             of course the kernel is always loaded
+ */
+
+__inline static boolean_t
+pmap_is_curpmap(pmap)
+       struct pmap *pmap;
+{
+
+       return((pmap == pmap_kernel()) ||
+              (pmap == curcpu()->ci_pmap));
+}
+
+/*
+ * pmap_is_active: is this pmap loaded into the specified processor's %cr3?
+ */
+
+__inline static boolean_t
+pmap_is_active(pmap, cpu_id)
+       struct pmap *pmap;
+       int cpu_id;
+{
+
+       return (pmap == pmap_kernel() ||
+           (pmap->pm_cpus & (1U << cpu_id)) != 0);
+}
+
+/*
+ * pmap_tmpmap_pa: map a page in for tmp usage
+ */
+
+__inline static vaddr_t
+pmap_tmpmap_pa(pa)
+       paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+       int id = cpu_number();
+#endif
+       pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
+       pt_entry_t *maptp;
+       caddr_t ptpva = VASLEW(ptpp, id);
+#if defined(DIAGNOSTIC)
+       if (*ptpte)
+               panic("pmap_tmpmap_pa: ptp_pte in use?");
+#endif
+       maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
+       PTE_SET(ptpte, maptp, PG_V | PG_RW | pa); /* always a new mapping */
+       return((vaddr_t)ptpva);
+}
+
+/*
+ * pmap_tmpunmap_pa: unmap a tmp use page (undoes pmap_tmpmap_pa)
+ */
+
+__inline static void
+pmap_tmpunmap_pa()
+{
+#ifdef MULTIPROCESSOR
+       int id = cpu_number();
+#endif
+       pt_entry_t *ptpte = PTESLEW(ptp_pte, id);
+       pt_entry_t *maptp;
+       caddr_t ptpva = VASLEW(ptpp, id);
+#if defined(DIAGNOSTIC)
+       if (!pmap_valid_entry(*ptpte))
+               panic("pmap_tmpunmap_pa: our pte invalid?");
+#endif
+       maptp = (pt_entry_t *)vtomach((vaddr_t)ptpte);
+       PTE_CLEAR(ptpte, maptp);                /* zap! */
+       pmap_update_pg((vaddr_t)ptpva);
+#ifdef MULTIPROCESSOR
+       /*
+        * No need for tlb shootdown here, since ptp_pte is per-CPU.
+        */
+#endif
+}
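+
+/*
+ * Usage sketch (illustrative only): the pair above brackets short-lived
+ * access to an arbitrary physical page, e.g.
+ *
+ *       char *p = (char *)pmap_tmpmap_pa(pa);
+ *       ... peek/poke p[0 .. PAGE_SIZE-1] ...
+ *       pmap_tmpunmap_pa();
+ *
+ * pmap_tmpmap_pvepte() below layers a PTE lookup on top of this.
+ */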
+
+/*
+ * pmap_tmpmap_pvepte: get a quick mapping of a PTE for a pv_entry
+ *
+ * => do NOT use this on kernel mappings [why?  because pv_ptp may be NULL]
+ */
+
+__inline static pt_entry_t *
+pmap_tmpmap_pvepte(pve)
+       struct pv_entry *pve;
+{
+#ifdef DIAGNOSTIC
+       if (pve->pv_pmap == pmap_kernel())
+               panic("pmap_tmpmap_pvepte: attempt to map kernel");
+#endif
+
+       /* is it current pmap?  use direct mapping... */
+       if (pmap_is_curpmap(pve->pv_pmap))
+               return(vtopte(pve->pv_va));
+
+       return(((pt_entry_t *)pmap_tmpmap_pa(VM_PAGE_TO_PHYS(pve->pv_ptp)))
+              + ptei((unsigned)pve->pv_va));
+}
+
+/*
+ * pmap_tmpunmap_pvepte: release a mapping obtained with pmap_tmpmap_pvepte
+ */
+
+__inline static void
+pmap_tmpunmap_pvepte(pve)
+       struct pv_entry *pve;
+{
+       /* was it current pmap?   if so, return */
+       if (pmap_is_curpmap(pve->pv_pmap))
+               return;
+
+       pmap_tmpunmap_pa();
+}
+
+__inline static void
+pmap_apte_flush(struct pmap *pmap)
+{
+#if defined(MULTIPROCESSOR)
+       struct pmap_tlb_shootdown_q *pq;
+       struct cpu_info *ci, *self = curcpu();
+       CPU_INFO_ITERATOR cii;
+       int s;
+#endif
+
+       tlbflush();             /* flush TLB on current processor */
+#if defined(MULTIPROCESSOR)
+       /*
+        * Flush the APTE mapping from all other CPUs that
+        * are using the pmap we are using (whose APTE space
+        * is the one we've just modified).
+        *
+        * XXXthorpej -- find a way to defer the IPI.
+        */
+       for (CPU_INFO_FOREACH(cii, ci)) {
+               if (ci == self)
+                       continue;
+               if (pmap_is_active(pmap, ci->ci_cpuid)) {
+                       pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+                       s = splipi();
+                       __cpu_simple_lock(&pq->pq_slock);
+                       pq->pq_flushu++;
+                       __cpu_simple_unlock(&pq->pq_slock);
+                       splx(s);
+                       x86_send_ipi(ci, X86_IPI_TLB);
+               }
+       }
+#endif
+}
+
+/*
+ * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
+ *
+ * => we lock the pmaps involved so the returned PTEs stay valid
+ * => must be undone with pmap_unmap_ptes before returning
+ */
+
+__inline static pt_entry_t *
+pmap_map_ptes(pmap)
+       struct pmap *pmap;
+{
+       pd_entry_t opde;
+       pd_entry_t *mapdp;
+       struct pmap *ourpmap;
+       struct cpu_info *ci;
+
+       /* the kernel's pmap is always accessible */
+       if (pmap == pmap_kernel()) {
+               return(PTE_BASE);
+       }
+
+       ci = curcpu();
+       if (ci->ci_want_pmapload &&
+           vm_map_pmap(&ci->ci_curlwp->l_proc->p_vmspace->vm_map) == pmap)
+               pmap_load();
+
+       /* if curpmap then we are always mapped */
+       if (pmap_is_curpmap(pmap)) {
+               simple_lock(&pmap->pm_obj.vmobjlock);
+               return(PTE_BASE);
+       }
+
+       ourpmap = ci->ci_pmap;
+
+       /* need to lock both curpmap and pmap: use ordered locking */
+       if ((unsigned) pmap < (unsigned) ourpmap) {
+               simple_lock(&pmap->pm_obj.vmobjlock);
+               simple_lock(&ourpmap->pm_obj.vmobjlock);
+       } else {
+               simple_lock(&ourpmap->pm_obj.vmobjlock);
+               simple_lock(&pmap->pm_obj.vmobjlock);
+       }
+
+       /* need to load a new alternate pt space into curpmap? */
+       COUNT(apdp_pde_map);
+       opde = PDE_GET(APDP_PDE);
+       if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
+               XENPRINTF(("APDP_PDE %p %p/%p set %p/%p\n",
+                          pmap,
+                          (void *)vtophys((vaddr_t)APDP_PDE),
+                          (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
+                          (void *)pmap->pm_pdirpa,
+                          (void *)xpmap_ptom(pmap->pm_pdirpa)));
+               mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+               PDE_SET(APDP_PDE, mapdp, pmap->pm_pdirpa /* | PG_RW */ | PG_V);
+#ifdef DEBUG
+               curapdp = pmap->pm_pdirpa;
+#endif
+               if (pmap_valid_entry(opde))
+                       pmap_apte_flush(ourpmap);
+               XENPRINTF(("APDP_PDE set done\n"));
+       }
+       return(APTE_BASE);
+}
+
+/*
+ * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
+ */
+
+__inline static void
+pmap_unmap_ptes(pmap)
+       struct pmap *pmap;
+{
+#if defined(MULTIPROCESSOR)
+       pd_entry_t *mapdp;
+#endif
+
+       if (pmap == pmap_kernel()) {
+               return;
+       }
+       if (pmap_is_curpmap(pmap)) {
+               simple_unlock(&pmap->pm_obj.vmobjlock);
+       } else {
+               struct pmap *ourpmap = curcpu()->ci_pmap;
+
+#if defined(MULTIPROCESSOR)
+               mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+               PDE_CLEAR(APDP_PDE, mapdp);
+               pmap_apte_flush(ourpmap);
+#endif
+#ifdef DEBUG
+               curapdp = 0;
+#endif
+               XENPRINTF(("APDP_PDE clear %p/%p set %p/%p\n",
+                          (void *)vtophys((vaddr_t)APDP_PDE),
+                          (void *)xpmap_ptom(vtophys((vaddr_t)APDP_PDE)),
+                          (void *)pmap->pm_pdirpa,
+                          (void *)xpmap_ptom(pmap->pm_pdirpa)));
+               COUNT(apdp_pde_unmap);
+               simple_unlock(&pmap->pm_obj.vmobjlock);
+               simple_unlock(&ourpmap->pm_obj.vmobjlock);
+       }
+}
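+
+/*
+ * Usage sketch (illustrative only): callers bracket PTE access with the
+ * pair above, as pmap_extract() does later in this file:
+ *
+ *       pt_entry_t *ptes = pmap_map_ptes(pmap);    (locks the pmap(s))
+ *       pte = PTE_GET(&ptes[x86_btop(va)]);        (array indexed by VA)
+ *       pmap_unmap_ptes(pmap);                     (unlocks the pmap(s))
+ */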
+
+__inline static void
+pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
+{
+       if (curproc == NULL || curproc->p_vmspace == NULL ||
+           pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
+               return;
+
+       if ((opte ^ npte) & PG_X)
+               pmap_update_pg(va);
+
+       /*
+        * Executability was removed on the last executable change.
+        * We can't recompute the highest remaining executable mapping
+        * here because of locking constraints on the vm map, so reset
+        * the code segment to something conservative and let the trap
+        * handler set the right limit later.
+        */
+
+       if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
+               struct trapframe *tf = curlwp->l_md.md_regs;
+               struct pcb *pcb = &curlwp->l_addr->u_pcb;
+
+               pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
+               pm->pm_hiexec = I386_MAX_EXE_ADDR;
+       }
+}
+
+__inline static pt_entry_t
+pte_mtop(pt_entry_t pte)
+{
+       pt_entry_t ppte;
+
+       KDASSERT(pmap_valid_entry(pte));
+       ppte = xpmap_mtop(pte);
+       if ((ppte & PG_FRAME) == XPMAP_OFFSET) {
+               XENPRINTF(("pte_mtop: null page %08x -> %08x\n",
+                   ppte, pte));
+               ppte = pte;
+       }
+
+       return ppte;
+}
+
+__inline static pt_entry_t
+pte_get_ma(pt_entry_t *pte)
+{
+
+       return *pte;
+}
+
+__inline static pt_entry_t
+pte_get(pt_entry_t *pte)
+{
+
+       if (pmap_valid_entry(*pte))
+               return pte_mtop(*pte);
+       return *pte;
+}
+
+__inline static pt_entry_t
+pte_atomic_update_ma(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
+{
+       pt_entry_t opte;
+
+       XENPRINTK(("pte_atomic_update_ma pte %p mapte %p npte %08x\n",
+                  pte, mapte, npte));
+       opte = PTE_GET_MA(pte);
+       if (opte > pmap_mem_end) {
+               /* must remove opte unchecked */
+               if (npte > pmap_mem_end)
+                       /* must set npte unchecked */
+                       xpq_queue_unchecked_pte_update(mapte, npte);
+               else {
+                       /* must set npte checked */
+                       xpq_queue_unchecked_pte_update(mapte, 0);
+                       xpq_queue_pte_update(mapte, npte);
+               }
+       } else {
+               /* must remove opte checked */
+               if (npte > pmap_mem_end) {
+                       /* must set npte unchecked */
+                       xpq_queue_pte_update(mapte, 0);
+                       xpq_queue_unchecked_pte_update(mapte, npte);
+               } else
+                       /* must set npte checked */
+                       xpq_queue_pte_update(mapte, npte);
+       }
+       xpq_flush_queue();
+
+       return opte;
+}
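+
+/*
+ * Summary of the four cases above (illustrative, not original code):
+ * "checked" updates go through Xen's PTE validation, "unchecked" ones
+ * bypass it for frames beyond pmap_mem_end:
+ *
+ *       old PTE         new PTE         queued operations
+ *       checked         checked         pte_update(new)
+ *       checked         unchecked       pte_update(0), unchecked(new)
+ *       unchecked       checked         unchecked(0), pte_update(new)
+ *       unchecked       unchecked       unchecked(new)
+ */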
+
+__inline static pt_entry_t
+pte_atomic_update(pt_entry_t *pte, pt_entry_t *mapte, pt_entry_t npte)
+{
+       pt_entry_t opte;
+
+       opte = pte_atomic_update_ma(pte, mapte, npte);
+
+       return pte_mtop(opte);
+}
+
+/*
+ * Fixup the code segment to cover all potential executable mappings.
+ * returns 0 if no changes to the code segment were made.
+ */
+
+int
+pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
+{
+       struct vm_map_entry *ent;
+       struct pmap *pm = vm_map_pmap(map);
+       vaddr_t va = 0;
+
+       vm_map_lock_read(map);
+       for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
+
+               /*
+                * This entry has greater va than the entries before.
+                * We need to make it point to the last page, not past it.
+                */
+
+               if (ent->protection & VM_PROT_EXECUTE)
+                       va = trunc_page(ent->end) - PAGE_SIZE;
+       }
+       vm_map_unlock_read(map);
+       if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
+               return (0);
+
+       pm->pm_hiexec = va;
+       if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
+               pcb->pcb_cs = tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
+       } else {
+               pcb->pcb_cs = tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
+               return (0);
+       }
+       return (1);
+}
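+
+/*
+ * Usage sketch (hypothetical caller, not part of this changeset): a
+ * protection-fault path could widen %cs and retry the instruction:
+ *
+ *       if (pmap_exec_fixup(&p->p_vmspace->vm_map, tf, &l->l_addr->u_pcb))
+ *               return;         (code segment grown, retry the fault)
+ */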
+
+/*
+ * p m a p   k e n t e r   f u n c t i o n s
+ *
+ * functions to quickly enter/remove pages from the kernel address
+ * space.   pmap_kremove is exported to MI kernel.  we make use of
+ * the recursive PTE mappings.
+ */
+
+/*
+ * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything, assume va is already allocated
+ * => should be faster than normal pmap enter function
+ */
+
+void
+pmap_kenter_pa(va, pa, prot)
+       vaddr_t va;
+       paddr_t pa;
+       vm_prot_t prot;
+{
+       pt_entry_t *pte, opte, npte;
+       pt_entry_t *maptp;
+
+       if (va < VM_MIN_KERNEL_ADDRESS)
+               pte = vtopte(va);
+       else
+               pte = kvtopte(va);
+
+       npte = ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
+            PG_V | pmap_pg_g;
+
+       if (pa >= pmap_pa_start && pa < pmap_pa_end) {
+               npte |= xpmap_ptom(pa);
+       } else {
+               XENPRINTF(("pmap_kenter: va %08lx outside pa range %08lx\n",
+                             va, pa));
+               npte |= pa;
+       }
+
+       maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+       opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
+       XENPRINTK(("pmap_kenter_pa(%p,%p) %p, was %08x now %08x\n", (void *)va, 
+                     (void *)pa, pte, opte, npte));
+#ifdef LARGEPAGES
+       /* XXX For now... */
+       if (opte & PG_PS)
+               panic("pmap_kenter_pa: PG_PS");
+#endif
+       if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+               int32_t cpumask = 0;
+
+               pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+               pmap_tlb_shootnow(cpumask);
+#else
+               /* Don't bother deferring in the single CPU case. */
+               pmap_update_pg(va);
+#endif
+       }
+}
+
+/*
+ * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything, assume va is already allocated
+ * => should be faster than normal pmap enter function
+ */
+
+void            pmap_kenter_ma __P((vaddr_t, paddr_t, vm_prot_t));
+
+void
+pmap_kenter_ma(va, ma, prot)
+       vaddr_t va;
+       paddr_t ma;
+       vm_prot_t prot;
+{
+       pt_entry_t *pte, opte, npte;
+       pt_entry_t *maptp;
+
+       KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
+       pte = kvtopte(va);
+
+       npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
+            PG_V | pmap_pg_g;
+
+       maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+       opte = pte_atomic_update_ma(pte, maptp, npte); /* zap! */
+       XENPRINTK(("pmap_kenter_ma(%p,%p) %p, was %08x\n", (void *)va,
+                     (void *)ma, pte, opte));
+#ifdef LARGEPAGES
+       /* XXX For now... */
+       if (opte & PG_PS)
+               panic("pmap_kenter_ma: PG_PS");
+#endif
+       if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+               int32_t cpumask = 0;
+
+               pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+               pmap_tlb_shootnow(cpumask);
+#else
+               /* Don't bother deferring in the single CPU case. */
+               pmap_update_pg(va);
+#endif
+       }
+}
+
+/*
+ * pmap_kremove: remove kernel mapping(s) without R/M (pv_entry) tracking
+ *
+ * => no need to lock anything
+ * => caller must dispose of any vm_page mapped in the va range
+ * => note: not an inline function
+ * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
+ * => we assume kernel only unmaps valid addresses and thus don't bother
+ *    checking the valid bit before doing TLB flushing
+ */
+
+void
+pmap_kremove(va, len)
+       vaddr_t va;
+       vsize_t len;
+{
+       pt_entry_t *pte, opte;
+       pt_entry_t *maptp;
+       int32_t cpumask = 0;
+
+       XENPRINTK(("pmap_kremove va %p, len %08lx\n", (void *)va, len));
+       len >>= PAGE_SHIFT;
+       for ( /* null */ ; len ; len--, va += PAGE_SIZE) {
+               if (va < VM_MIN_KERNEL_ADDRESS)
+                       pte = vtopte(va);
+               else
+                       pte = kvtopte(va);
+               maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+               opte = pte_atomic_update_ma(pte, maptp, 0); /* zap! */
+               XENPRINTK(("pmap_kremove pte %p, was %08x\n", pte, opte));
+#ifdef LARGEPAGES
+               /* XXX For now... */
+               if (opte & PG_PS)
+                       panic("pmap_kremove: PG_PS");
+#endif
+#ifdef DIAGNOSTIC
+               if (opte & PG_PVLIST)
+                       panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
+                             va);
+#endif
+               if ((opte & (PG_V | PG_U)) == (PG_V | PG_U))
+                       pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+       }
+       pmap_tlb_shootnow(cpumask);
+}
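+
+/*
+ * Usage sketch (illustrative only): a temporary kernel mapping with these
+ * interfaces looks like
+ *
+ *       pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
+ *       pmap_update(pmap_kernel());
+ *       ... use the memory at va ...
+ *       pmap_kremove(va, PAGE_SIZE);
+ *       pmap_update(pmap_kernel());
+ *
+ * pmap_alloc_pvpage() below uses the enter half of this pattern to map
+ * its new pv_page.
+ */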
+
+/*
+ * p m a p   i n i t   f u n c t i o n s
+ *
+ * pmap_bootstrap and pmap_init are called during system startup
+ * to init the pmap module.   pmap_bootstrap() does a low level
+ * init just to get things rolling.   pmap_init() finishes the job.
+ */
+
+/*
+ * pmap_bootstrap: get the system in a state where it can run with VM
+ *     properly enabled (called before main()).   the VM system is
+ *      fully init'd later...
+ *
+ * => on i386, locore.s has already enabled the MMU by allocating
+ *     a PDP for the kernel, and nkpde PTP's for the kernel.
+ * => kva_start is the first free virtual address in kernel space
+ */
+
+void
+pmap_bootstrap(kva_start)
+       vaddr_t kva_start;
+{
+       struct pmap *kpm;
+       vaddr_t kva;
+       pt_entry_t *pte;
+       pt_entry_t *maptp;
+       int i;
+
+       /*
+        * set up our local static global vars that keep track of the
+        * usage of KVM before kernel_map is set up
+        */
+
+       virtual_avail = kva_start;              /* first free KVA */
+       virtual_end = VM_MAX_KERNEL_ADDRESS;    /* last KVA */
+
+       /*
+        * find out where physical memory ends on the real hardware.
+        */
+
+       if (xen_start_info.flags & SIF_PRIVILEGED)
+               pmap_mem_end = find_pmap_mem_end(kva_start);
+
+       /*
+        * set up protection_codes: we need to be able to convert from
+        * an MI protection code (some combo of VM_PROT...) to something
+        * we can jam into an i386 PTE.
+        */
+
+       protection_codes[VM_PROT_NONE] = 0;                     /* --- */
+       protection_codes[VM_PROT_EXECUTE] = PG_X;               /* --x */
+       protection_codes[VM_PROT_READ] = PG_RO;                 /* -r- */
+       protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO|PG_X;/* -rx */
+       protection_codes[VM_PROT_WRITE] = PG_RW;                /* w-- */
+       protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW|PG_X;/* w-x */
+       protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW;   /* wr- */
+       protection_codes[VM_PROT_ALL] = PG_RW|PG_X;             /* wrx */
+
+       /*
+        * now we init the kernel's pmap
+        *
+        * the kernel pmap's pm_obj is not used for much.   however, in
+        * user pmaps the pm_obj contains the list of active PTPs.
+        * the pm_obj currently does not have a pager.   it might be possible
+        * to add a pager that would allow a process to read-only mmap its
+        * own page tables (fast user level vtophys?).   this may or may not
+        * be useful.
+        */
+
+       kpm = pmap_kernel();
+       simple_lock_init(&kpm->pm_obj.vmobjlock);
+       kpm->pm_obj.pgops = NULL;
+       TAILQ_INIT(&kpm->pm_obj.memq);
+       kpm->pm_obj.uo_npages = 0;
+       kpm->pm_obj.uo_refs = 1;
+       memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
+       kpm->pm_pdir = (pd_entry_t *)(lwp0.l_addr->u_pcb.pcb_cr3 + KERNBASE);
+       XENPRINTF(("pm_pdirpa %p PTDpaddr %p\n",
+           (void *)lwp0.l_addr->u_pcb.pcb_cr3, (void *)PTDpaddr));
+       kpm->pm_pdirpa = (u_int32_t) lwp0.l_addr->u_pcb.pcb_cr3;
+       kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
+               x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
+
+       /*
+        * the above is just a rough estimate and not critical to the proper
+        * operation of the system.
+        */
+
+       /*
+        * Begin to enable global TLB entries if they are supported.
+        * The G bit has no effect until the CR4_PGE bit is set in CR4,
+        * which happens in cpu_init(), which is run on each cpu
+        * (and happens later)
+        */
+
+       if (cpu_feature & CPUID_PGE) {
+               pmap_pg_g = PG_G;               /* enable software */
+
+               /* add PG_G attribute to already mapped kernel pages */
+               for (kva = VM_MIN_KERNEL_ADDRESS ; kva < virtual_avail ;
+                    kva += PAGE_SIZE)
+                       if (pmap_valid_entry(PTE_BASE[x86_btop(kva)])) {
+#if !defined(XEN)
+                               PTE_BASE[x86_btop(kva)] |= PG_G;
+#else
+                               maptp = (pt_entry_t *)vtomach(
+                                       (vaddr_t)&PTE_BASE[x86_btop(kva)]);
+                               PTE_SETBITS(&PTE_BASE[x86_btop(kva)], maptp,
+                                   PG_G);
+#endif
+                       }
+#if defined(XEN)
+               PTE_UPDATES_FLUSH();
+#endif
+       }
+
+#ifdef LARGEPAGES
+       /*
+        * enable large pages if they are supported.
+        */
+
+       if (cpu_feature & CPUID_PSE) {
+               paddr_t pa;
+               vaddr_t kva_end;
+               pd_entry_t *pde;
+               pd_entry_t *mapdp;
+               extern char _etext;
+
+               lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
+               pmap_largepages = 1;    /* enable software */
+
+               /*
+                * the TLB must be flushed after enabling large pages
+                * on Pentium CPUs, according to section 3.6.2.2 of
+                * "Intel Architecture Software Developer's Manual,
+                * Volume 3: System Programming".
+                */
+               tlbflush();
+
+               /*
+                * now, remap the kernel text using large pages.  we
+                * assume that the linker has properly aligned the
+                * .data segment to a 4MB boundary.
+                */
+               kva_end = roundup((vaddr_t)&_etext, NBPD);
+               for (pa = 0, kva = KERNBASE; kva < kva_end;
+                    kva += NBPD, pa += NBPD) {
+                       pde = &kpm->pm_pdir[pdei(kva)];
+                       mapdp = (pt_entry_t *)vtomach((vaddr_t)pde);
+                       PDE_SET(pde, mapdp, pa | pmap_pg_g | PG_PS |
+                           PG_KR | PG_V); /* zap! */
+                       tlbflush();
+               }
+       }
+#endif /* LARGEPAGES */
+
+       /*
+        * now we allocate the "special" VAs which are used for tmp mappings
+        * by the pmap (and other modules).    we allocate the VAs by advancing
+        * virtual_avail (note that there are no pages mapped at these VAs).
+        * we find the PTE that maps the allocated VA via the linear PTE
+        * mapping.
+        */
+
+       pte = PTE_BASE + x86_btop(virtual_avail);
+
+#ifdef MULTIPROCESSOR
+       /*
+        * Waste some VA space to avoid false sharing of cache lines
+        * for page table pages: Give each possible CPU a cache line
+        * of PTE's (8) to play with, though we only need 4.  We could
+        * recycle some of this waste by putting the idle stacks here
+        * as well; we could waste less space if we knew the largest
+        * CPU ID beforehand.
+        */
+       csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;
+
+       cdstp = (caddr_t) virtual_avail+PAGE_SIZE;  cdst_pte = pte+1;
+
+       zerop = (caddr_t) virtual_avail+PAGE_SIZE*2;  zero_pte = pte+2;
+
+       ptpp = (caddr_t) virtual_avail+PAGE_SIZE*3;  ptp_pte = pte+3;
+
+       virtual_avail += PAGE_SIZE * X86_MAXPROCS * NPTECL;
+       pte += X86_MAXPROCS * NPTECL;
+#else
+       csrcp = (caddr_t) virtual_avail;  csrc_pte = pte;  /* allocate */
+       virtual_avail += PAGE_SIZE; pte++;              /* advance */
+
+       cdstp = (caddr_t) virtual_avail;  cdst_pte = pte;
+       virtual_avail += PAGE_SIZE; pte++;
+
+       zerop = (caddr_t) virtual_avail;  zero_pte = pte;
+       virtual_avail += PAGE_SIZE; pte++;
+
+       ptpp = (caddr_t) virtual_avail;  ptp_pte = pte;
+       virtual_avail += PAGE_SIZE; pte++;
+#endif
+
+       XENPRINTK(("pmap_bootstrap csrcp %p cdstp %p zerop %p ptpp %p\n", 
+                     csrc_pte, cdst_pte, zero_pte, ptp_pte));
+       /*
+        * Nothing after this point actually needs pte.
+        */
+       pte = (void *)0xdeadbeef;
+
+       /* XXX: vmmap used by mem.c... should be uvm_map_reserve */
+       vmmap = (char *)virtual_avail;                  /* don't need pte */
+       virtual_avail += PAGE_SIZE;
+
+       msgbuf_vaddr = virtual_avail;                   /* don't need pte */
+       virtual_avail += round_page(MSGBUFSIZE);
+
+       idt_vaddr = virtual_avail;                      /* don't need pte */
+       virtual_avail += PAGE_SIZE;
+       idt_paddr = avail_start;                        /* steal a page */
+       avail_start += PAGE_SIZE;
+
+#if defined(I586_CPU)
+       /* pentium f00f bug stuff */
+       pentium_idt_vaddr = virtual_avail;              /* don't need pte */
+       virtual_avail += PAGE_SIZE;
+#endif
+
+       /*
+        * now we reserve some VM for mapping pages when doing a crash dump
+        */
+
+       virtual_avail = reserve_dumppages(virtual_avail);
+
+       /*
+        * init the static-global locks and global lists.
+        */
+
+#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
+       spinlockinit(&pmap_main_lock, "pmaplk", 0);
+#endif
+       simple_lock_init(&pvalloc_lock);
+       simple_lock_init(&pmaps_lock);
+       LIST_INIT(&pmaps);
+       TAILQ_INIT(&pv_freepages);
+       TAILQ_INIT(&pv_unusedpgs);
+
+       /*
+        * initialize the pmap pool.
+        */
+
+       pool_init(&pmap_pmap_pool, sizeof(struct pmap), 0, 0, 0, "pmappl",
+           &pool_allocator_nointr);
+
+       /*
+        * Initialize the TLB shootdown queues.
+        */
+
+       __cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
+
+       for (i = 0; i < X86_MAXPROCS; i++) {
+               TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
+               __cpu_simple_lock_init(&pmap_tlb_shootdown_q[i].pq_slock);
+       }
+
+       /*
+        * initialize the PDE pool and cache.
+        */
+       pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, 0, "pdppl",
+                 &pool_allocator_nointr);
+       pool_cache_init(&pmap_pdp_cache, &pmap_pdp_pool,
+                       pmap_pdp_ctor, pmap_pdp_dtor, NULL);
+
+       /*
+        * ensure the TLB is sync'd with reality by flushing it...
+        */
+
+       tlbflush();
+}
+
+/*
+ * pmap_init: called from uvm_init, our job is to get the pmap
+ * system ready to manage mappings... this mainly means initing
+ * the pv_entry stuff.
+ */
+
+void
+pmap_init()
+{
+       int i;
+
+       /*
+        * now we need enough free pv_entry structures to allow us to get
+        * the kmem_map/kmem_object allocated and inited (done after this
+        * function is finished).  to do this we allocate one bootstrap page out
+        * of kernel_map and use it to provide an initial pool of pv_entry
+        * structures.   we never free this page.
+        */
+
+       pv_initpage = (struct pv_page *) uvm_km_alloc(kernel_map, PAGE_SIZE);
+       if (pv_initpage == NULL)
+               panic("pmap_init: pv_initpage");
+       pv_cachedva = 0;   /* a VA we have allocated but not used yet */
+       pv_nfpvents = 0;
+       (void) pmap_add_pvpage(pv_initpage, FALSE);
+
+       pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE);
+       if (pj_page == NULL)
+               panic("pmap_init: pj_page");
+
+       for (i = 0;
+            i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
+            i++)
+               pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
+       pj_page[i].pja_job.pj_nextfree = NULL;
+       pj_free = &pj_page[0];
+
+       /*
+        * done: pmap module is up (and ready for business)
+        */
+
+       pmap_initialized = TRUE;
+}
+
+/*
+ * p v _ e n t r y   f u n c t i o n s
+ */
+
+/*
+ * pv_entry allocation functions:
+ *   the main pv_entry allocation functions are:
+ *     pmap_alloc_pv: allocate a pv_entry structure
+ *     pmap_free_pv: free one pv_entry
+ *     pmap_free_pvs: free a list of pv_entrys
+ *
+ * the rest are helper functions
+ */
+
+/*
+ * pmap_alloc_pv: inline function to allocate a pv_entry structure
+ * => we lock pvalloc_lock
+ * => if we fail, we call out to pmap_alloc_pvpage
+ * => 3 modes:
+ *    ALLOCPV_NEED   = we really need a pv_entry, even if we have to steal it
+ *    ALLOCPV_TRY    = we want a pv_entry, but not enough to steal
+ *    ALLOCPV_NONEED = we are trying to grow our free list, don't really need
+ *                     one now
+ *
+ * "try" is for optional functions like pmap_copy().
+ */
+
+__inline static struct pv_entry *
+pmap_alloc_pv(pmap, mode)
+       struct pmap *pmap;
+       int mode;
+{
+       struct pv_page *pvpage;
+       struct pv_entry *pv;
+
+       simple_lock(&pvalloc_lock);
+
+       pvpage = TAILQ_FIRST(&pv_freepages);
+       if (pvpage != NULL) {
+               pvpage->pvinfo.pvpi_nfree--;
+               if (pvpage->pvinfo.pvpi_nfree == 0) {
+                       /* nothing left in this one? */
+                       TAILQ_REMOVE(&pv_freepages, pvpage, pvinfo.pvpi_list);
+               }
+               pv = pvpage->pvinfo.pvpi_pvfree;
+               KASSERT(pv);
+               pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
+               pv_nfpvents--;  /* took one from pool */
+       } else {
+               pv = NULL;              /* need more of them */
+       }
+
+       /*
+        * if below low water mark or we didn't get a pv_entry we try to
+        * create more pv_entrys ...
+        */
+
+       if (pv_nfpvents < PVE_LOWAT || pv == NULL) {
+               if (pv == NULL)
+                       pv = pmap_alloc_pvpage(pmap, (mode == ALLOCPV_TRY) ?
+                                              mode : ALLOCPV_NEED);
+               else
+                       (void) pmap_alloc_pvpage(pmap, ALLOCPV_NONEED);
+       }
+       simple_unlock(&pvalloc_lock);
+       return(pv);
+}
+
+/*
+ * pmap_alloc_pvpage: maybe allocate a new pvpage
+ *
+ * if need_entry is false: try to allocate a new pv_page
+ * if need_entry is true: try to allocate a new pv_page and return a
+ *     new pv_entry from it.   if we are unable to allocate a pv_page
+ *     we return NULL and the caller must cope with the failure.
+ *
+ * => we assume that the caller holds pvalloc_lock
+ */
+
+static struct pv_entry *
+pmap_alloc_pvpage(pmap, mode)
+       struct pmap *pmap;
+       int mode;
+{
+       struct vm_page *pg;
+       struct pv_page *pvpage;
+       struct pv_entry *pv;
+       int s;
+
+       /*
+        * if we need_entry and we've got unused pv_pages, allocate from there
+        */
+
+       pvpage = TAILQ_FIRST(&pv_unusedpgs);
+       if (mode != ALLOCPV_NONEED && pvpage != NULL) {
+
+               /* move it to pv_freepages list */
+               TAILQ_REMOVE(&pv_unusedpgs, pvpage, pvinfo.pvpi_list);
+               TAILQ_INSERT_HEAD(&pv_freepages, pvpage, pvinfo.pvpi_list);
+
+               /* allocate a pv_entry */
+               pvpage->pvinfo.pvpi_nfree--;    /* can't go to zero */
+               pv = pvpage->pvinfo.pvpi_pvfree;
+               KASSERT(pv);
+               pvpage->pvinfo.pvpi_pvfree = SPLAY_RIGHT(pv, pv_node);
+               pv_nfpvents--;  /* took one from pool */
+               return(pv);
+       }
+
+       /*
+        * see if we've got a cached unmapped VA that we can map a page in.
+        * if not, try to allocate one.
+        */
+
+       if (pv_cachedva == 0) {
+               s = splvm();   /* must protect kmem_map with splvm! */
+               pv_cachedva = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE,
+                   UVM_KMF_TRYLOCK|UVM_KMF_VALLOC);
+               splx(s);
+               if (pv_cachedva == 0) {
+                       return (NULL);
+               }
+       }
+
+       pg = uvm_pagealloc(NULL, pv_cachedva - vm_map_min(kernel_map), NULL,
+           UVM_PGA_USERESERVE);
+       if (pg == NULL)
+               return (NULL);
+       pg->flags &= ~PG_BUSY;  /* never busy */
+
+       /*
+        * add a mapping for our new pv_page and free its entrys (save one!)
+        *
+        * NOTE: If we are allocating a PV page for the kernel pmap, the
+        * pmap is already locked!  (...but entering the mapping is safe...)
+        */
+
+       pmap_kenter_pa(pv_cachedva, VM_PAGE_TO_PHYS(pg),
+           VM_PROT_READ | VM_PROT_WRITE);
+       pmap_update(pmap_kernel());
+       pvpage = (struct pv_page *) pv_cachedva;
+       pv_cachedva = 0;
+       return (pmap_add_pvpage(pvpage, mode != ALLOCPV_NONEED));
+}
+
+/*
+ * pmap_add_pvpage: add a pv_page's pv_entrys to the free list
+ *
+ * => caller must hold pvalloc_lock
+ * => if need_entry is true, we allocate and return one pv_entry
+ */
+
+static struct pv_entry *
+pmap_add_pvpage(pvp, need_entry)
+       struct pv_page *pvp;
+       boolean_t need_entry;
+{
+       int tofree, lcv;
+
+       /* do we need to return one? */
+       tofree = (need_entry) ? PVE_PER_PVPAGE - 1 : PVE_PER_PVPAGE;
+
+       pvp->pvinfo.pvpi_pvfree = NULL;
+       pvp->pvinfo.pvpi_nfree = tofree;
+       for (lcv = 0 ; lcv < tofree ; lcv++) {
+               SPLAY_RIGHT(&pvp->pvents[lcv], pv_node) =
+                       pvp->pvinfo.pvpi_pvfree;
+               pvp->pvinfo.pvpi_pvfree = &pvp->pvents[lcv];
+       }
+       if (need_entry)
+               TAILQ_INSERT_TAIL(&pv_freepages, pvp, pvinfo.pvpi_list);
+       else
+               TAILQ_INSERT_TAIL(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+       pv_nfpvents += tofree;
+       return((need_entry) ? &pvp->pvents[lcv] : NULL);
+}
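+
+/*
+ * Illustrative picture (not original code): the per-page free list is
+ * threaded through the SPLAY_RIGHT pointer of each free pv_entry, with
+ * pvpi_pvfree as the head, so after the loop above:
+ *
+ *       pvpi_pvfree -> pvents[tofree-1] -> ... -> pvents[0] -> NULL
+ *
+ * and pvents[tofree] is the entry handed back when need_entry is true.
+ */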
+
+/*
+ * pmap_free_pv_doit: actually free a pv_entry
+ *
+ * => do not call this directly!  instead use either
+ *    1. pmap_free_pv ==> free a single pv_entry
+ *    2. pmap_free_pvs => free a list of pv_entrys
+ * => we must be holding pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pv_doit(pv)
+       struct pv_entry *pv;
+{
+       struct pv_page *pvp;
+
+       pvp = (struct pv_page *) x86_trunc_page(pv);
+       pv_nfpvents++;
+       pvp->pvinfo.pvpi_nfree++;
+
+       /* nfree == 1 => fully allocated page just became partly allocated */
+       if (pvp->pvinfo.pvpi_nfree == 1) {
+               TAILQ_INSERT_HEAD(&pv_freepages, pvp, pvinfo.pvpi_list);
+       }
+
+       /* free it */
+       SPLAY_RIGHT(pv, pv_node) = pvp->pvinfo.pvpi_pvfree;
+       pvp->pvinfo.pvpi_pvfree = pv;
+
+       /*
+        * are all pv_page's pv_entry's free?  move it to unused queue.
+        */
+
+       if (pvp->pvinfo.pvpi_nfree == PVE_PER_PVPAGE) {
+               TAILQ_REMOVE(&pv_freepages, pvp, pvinfo.pvpi_list);
+               TAILQ_INSERT_HEAD(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+       }
+}
+
+/*
+ * pmap_free_pv: free a single pv_entry
+ *
+ * => we gain the pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pv(pmap, pv)
+       struct pmap *pmap;
+       struct pv_entry *pv;
+{
+       simple_lock(&pvalloc_lock);
+       pmap_free_pv_doit(pv);
+
+       /*
+        * Can't free the PV page if the PV entries were associated with
+        * the kernel pmap; the pmap is already locked.
+        */
+       if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
+           pmap != pmap_kernel())
+               pmap_free_pvpage();
+
+       simple_unlock(&pvalloc_lock);
+}
+
+/*
+ * pmap_free_pvs: free a list of pv_entrys
+ *
+ * => we gain the pvalloc_lock
+ */
+
+__inline static void
+pmap_free_pvs(pmap, pvs)
+       struct pmap *pmap;
+       struct pv_entry *pvs;
+{
+       struct pv_entry *nextpv;
+
+       simple_lock(&pvalloc_lock);
+
+       for ( /* null */ ; pvs != NULL ; pvs = nextpv) {
+               nextpv = SPLAY_RIGHT(pvs, pv_node);
+               pmap_free_pv_doit(pvs);
+       }
+
+       /*
+        * Can't free the PV page if the PV entries were associated with
+        * the kernel pmap; the pmap is already locked.
+        */
+       if (pv_nfpvents > PVE_HIWAT && TAILQ_FIRST(&pv_unusedpgs) != NULL &&
+           pmap != pmap_kernel())
+               pmap_free_pvpage();
+
+       simple_unlock(&pvalloc_lock);
+}
+
+
+/*
+ * pmap_free_pvpage: try and free an unused pv_page structure
+ *
+ * => assume caller is holding the pvalloc_lock and that
+ *     there is a page on the pv_unusedpgs list
+ * => if we can't get a lock on the kmem_map we try again later
+ */
+
+static void
+pmap_free_pvpage()
+{
+       int s;
+       struct vm_map *map;
+       struct vm_map_entry *dead_entries;
+       struct pv_page *pvp;
+
+       s = splvm(); /* protect kmem_map */
+
+       pvp = TAILQ_FIRST(&pv_unusedpgs);
+
+       /*
+        * note: watch out for pv_initpage which is allocated out of
+        * kernel_map rather than kmem_map.
+        */
+
+       if (pvp == pv_initpage)
+               map = kernel_map;
+       else
+               map = kmem_map;
+       if (vm_map_lock_try(map)) {
+
+               /* remove pvp from pv_unusedpgs */
+               TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list);
+
+               /* unmap the page */
+               dead_entries = NULL;
+               uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE,
+                   &dead_entries);
+               vm_map_unlock(map);
+
+               if (dead_entries != NULL)
+                       uvm_unmap_detach(dead_entries, 0);
+
+               pv_nfpvents -= PVE_PER_PVPAGE;  /* update free count */
+
+               if (pvp == pv_initpage)
+                       /* no more initpage, we've freed it */
+                       pv_initpage = NULL;
+       }
+
+       splx(s);
+}
+
+/*
+ * pmap_lock_pvhs: Lock pvh1 and optional pvh2
+ *                 Observe locking order when locking both pvhs
+ */
+
+__inline static void
+pmap_lock_pvhs(struct pv_head *pvh1, struct pv_head *pvh2)
+{
+
+       if (pvh2 == NULL) {
+               simple_lock(&pvh1->pvh_lock);
+               return;
+       }
+
+       if (pvh1 < pvh2) {
+               simple_lock(&pvh1->pvh_lock);
+               simple_lock(&pvh2->pvh_lock);
+       } else {
+               simple_lock(&pvh2->pvh_lock);
+               simple_lock(&pvh1->pvh_lock);
+       }
+}
+
+
+/*
+ * main pv_entry manipulation functions:
+ *   pmap_enter_pv: enter a mapping onto a pv_head list
+ *   pmap_remove_pv: remove a mapping from a pv_head list
+ *
+ * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock 
+ *       the pvh before calling
+ */
+
+/*
+ * pmap_enter_pv: enter a mapping onto a pv_head list
+ *
+ * => caller should hold the proper lock on pmap_main_lock
+ * => caller should have pmap locked
+ * => caller should have the pv_head locked
+ * => caller should adjust ptp's wire_count before calling
+ */
+
+__inline static void
+pmap_enter_pv(pvh, pve, pmap, va, ptp)
+       struct pv_head *pvh;
+       struct pv_entry *pve;   /* preallocated pve for us to use */
+       struct pmap *pmap;
+       vaddr_t va;
+       struct vm_page *ptp;    /* PTP in pmap that maps this VA */
+{
+       pve->pv_pmap = pmap;
+       pve->pv_va = va;
+       pve->pv_ptp = ptp;                      /* NULL for kernel pmap */
+       SPLAY_INSERT(pvtree, &pvh->pvh_root, pve); /* add to locked list */
+}
+
+/*
+ * pmap_remove_pv: try to remove a mapping from a pv_list
+ *
+ * => caller should hold proper lock on pmap_main_lock
+ * => pmap should be locked
+ * => caller should hold lock on pv_head [so that attrs can be adjusted]
+ * => caller should adjust ptp's wire_count and free PTP if needed
+ * => we return the removed pve
+ */
+
+__inline static struct pv_entry *
+pmap_remove_pv(pvh, pmap, va)
+       struct pv_head *pvh;
+       struct pmap *pmap;
+       vaddr_t va;
+{
+       struct pv_entry tmp, *pve;
+
+       tmp.pv_pmap = pmap;
+       tmp.pv_va = va;
+       pve = SPLAY_FIND(pvtree, &pvh->pvh_root, &tmp);
+       if (pve == NULL)
+               return (NULL);
+       SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve);
+       return(pve);                            /* return removed pve */
+}
+
+/*
+ * p t p   f u n c t i o n s
+ */
+
+/*
+ * pmap_alloc_ptp: allocate a PTP for a PMAP
+ *
+ * => pmap should already be locked by caller
+ * => we use the ptp's wire_count to count the number of active mappings
+ *     in the PTP (we start it at one to prevent any chance this PTP
+ *     will ever leak onto the active/inactive queues)
+ */
+
+__inline static struct vm_page *
+pmap_alloc_ptp(pmap, pde_index)
+       struct pmap *pmap;
+       int pde_index;
+{
+       struct vm_page *ptp;
+       pd_entry_t *mapdp;
+
+       ptp = uvm_pagealloc(&pmap->pm_obj, ptp_i2o(pde_index), NULL,
+                           UVM_PGA_USERESERVE|UVM_PGA_ZERO);
+       if (ptp == NULL)
+               return(NULL);
+
+       /* got one! */
+       ptp->flags &= ~PG_BUSY; /* never busy */
+       ptp->wire_count = 1;    /* no mappings yet */
+       mapdp = (pt_entry_t *)vtomach((vaddr_t)&pmap->pm_pdir[pde_index]);
+       PDE_SET(&pmap->pm_pdir[pde_index], mapdp,
+           (pd_entry_t) (VM_PAGE_TO_PHYS(ptp) | PG_u | PG_RW | PG_V));
+       pmap->pm_stats.resident_count++;        /* count PTP as resident */
+       pmap->pm_ptphint = ptp;
+       return(ptp);
+}
+
+/*
+ * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
+ *
+ * => pmap should NOT be pmap_kernel()
+ * => pmap should be locked
+ */
+
+static struct vm_page *
+pmap_get_ptp(pmap, pde_index)
+       struct pmap *pmap;
+       int pde_index;
+{
+       struct vm_page *ptp;
+
+       if (pmap_valid_entry(pmap->pm_pdir[pde_index])) {
+
+               /* valid... check hint (saves us a PA->PG lookup) */
+               if (pmap->pm_ptphint &&
+                   (PDE_GET(&pmap->pm_pdir[pde_index]) & PG_FRAME) ==
+                   VM_PAGE_TO_PHYS(pmap->pm_ptphint))
+                       return(pmap->pm_ptphint);
+
+               ptp = uvm_pagelookup(&pmap->pm_obj, ptp_i2o(pde_index));
+#ifdef DIAGNOSTIC
+               if (ptp == NULL)
+                       panic("pmap_get_ptp: unmanaged user PTP");
+#endif
+               pmap->pm_ptphint = ptp;
+               return(ptp);
+       }
+
+       /* allocate a new PTP (updates ptphint) */
+       return(pmap_alloc_ptp(pmap, pde_index));
+}
+
+/*
+ * p m a p  l i f e c y c l e   f u n c t i o n s
+ */
+
+/*
+ * pmap_pdp_ctor: constructor for the PDP cache.
+ */
+
+int
+pmap_pdp_ctor(void *arg, void *object, int flags)
+{
+       pd_entry_t *pdir = object;
+       paddr_t pdirpa;
+
+       /*
+        * NOTE: The `pmap_lock' is held when the PDP is allocated.
+        * WE MUST NOT BLOCK!
+        */
+
+       /* fetch the physical address of the page directory. */
+       (void) pmap_extract(pmap_kernel(), (vaddr_t) pdir, &pdirpa);
+
+       XENPRINTF(("pmap_pdp_ctor %p %p\n", pdir, (void *)pdirpa));
+
+       /* zero init area */
+       memset(pdir, 0, PDSLOT_PTE * sizeof(pd_entry_t));
+
+       /* put in recursive PDE to map the PTEs */
+       pdir[PDSLOT_PTE] = xpmap_ptom(pdirpa | PG_V /* | PG_KW */);
+
+       /* put in kernel VM PDEs */
+       memcpy(&pdir[PDSLOT_KERN], &PDP_BASE[PDSLOT_KERN],
+           nkpde * sizeof(pd_entry_t));
+
+       /* zero the rest */
+       memset(&pdir[PDSLOT_KERN + nkpde], 0,
+           PAGE_SIZE - ((PDSLOT_KERN + nkpde) * sizeof(pd_entry_t)));
+
+       pmap_enter(pmap_kernel(), (vaddr_t)pdir, pdirpa, VM_PROT_READ,
+           VM_PROT_READ);
+       pmap_update(pmap_kernel());
+
+       /* pin page type */
+       xpq_queue_pin_table(xpmap_ptom(pdirpa), XPQ_PIN_L2_TABLE);
+       xpq_flush_queue();
+
+       return (0);
+}
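+
+/*
+ * Resulting PDP layout (illustrative summary of the ctor above, assuming
+ * PDSLOT_KERN == PDSLOT_PTE + 1 as on i386):
+ *
+ *       pdir[0 .. PDSLOT_PTE-1]                         zeroed (user VA)
+ *       pdir[PDSLOT_PTE]                                recursive PDE
+ *       pdir[PDSLOT_KERN .. PDSLOT_KERN+nkpde-1]        kernel PDEs copied
+ *       pdir[PDSLOT_KERN+nkpde .. ]                     zeroed
+ *
+ * The page is then entered read-only and pinned as an L2 table so that
+ * Xen will later accept it as a page directory.
+ */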
+
+void
+pmap_pdp_dtor(void *arg, void *object)
+{
+       pd_entry_t *pdir = object;
+       paddr_t pdirpa;
+
+       /* fetch the physical address of the page directory. */
+       pdirpa = PDE_GET(&pdir[PDSLOT_PTE]) & PG_FRAME;
+
+       XENPRINTF(("pmap_pdp_dtor %p %p\n", pdir, (void *)pdirpa));
+
+       /* unpin page type */
+       xpq_queue_unpin_table(xpmap_ptom(pdirpa));
+       xpq_flush_queue();
+}
+
+/*
+ * pmap_create: create a pmap
+ *
+ * => note: old pmap interface took a "size" arg which allowed for
+ *     the creation of "software only" pmaps (not in bsd).
+ */
+
+struct pmap *
+pmap_create()
+{
+       struct pmap *pmap;
+       u_int gen;
+
+       XENPRINTF(("pmap_create\n"));
+       pmap = pool_get(&pmap_pmap_pool, PR_WAITOK);
+
+       /* init uvm_object */
+       simple_lock_init(&pmap->pm_obj.vmobjlock);
+       pmap->pm_obj.pgops = NULL;      /* currently not a mappable object */
+       TAILQ_INIT(&pmap->pm_obj.memq);
+       pmap->pm_obj.uo_npages = 0;
+       pmap->pm_obj.uo_refs = 1;
+       pmap->pm_stats.wired_count = 0;
+       pmap->pm_stats.resident_count = 1;      /* count the PDP allocd below */
+       pmap->pm_ptphint = NULL;
+       pmap->pm_hiexec = 0;
+       pmap->pm_flags = 0;
+       pmap->pm_cpus = 0;
+
+       /* init the LDT */
+       pmap->pm_ldt = NULL;
+       pmap->pm_ldt_len = 0;
+       pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+
+       /* allocate PDP */
+
+       /*
+        * we need to lock pmaps_lock to prevent nkpde from changing on
+        * us.  note that there is no need to splvm to protect us from
+        * malloc since malloc allocates out of a submap and we should
+        * have already allocated kernel PTPs to cover the range...
+        *
+        * NOTE: WE MUST NOT BLOCK WHILE HOLDING THE `pmap_lock', nor
+        * must we call pmap_growkernel() while holding it!
+        */
+
+ try_again:
+       gen = pmap_pdp_cache_generation;
+       pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
+
+       simple_lock(&pmaps_lock);
+
+       if (gen != pmap_pdp_cache_generation) {
+               simple_unlock(&pmaps_lock);
+               pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
+               goto try_again;
+       }
+
+       pmap->pm_pdirpa = PDE_GET(&pmap->pm_pdir[PDSLOT_PTE]) & PG_FRAME;
+       XENPRINTF(("pmap_create %p set pm_pdirpa %p/%p slotval %p\n", pmap,
+                  (void *)pmap->pm_pdirpa,
+                  (void *)xpmap_ptom(pmap->pm_pdirpa),
+                  (void *)pmap->pm_pdir[PDSLOT_PTE]));
+
+       LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
+
+       simple_unlock(&pmaps_lock);
+
+       return (pmap);
+}
+
+/*
+ * pmap_destroy: drop reference count on pmap.   free pmap if
+ *     reference count goes to zero.
+ */
+
+void
+pmap_destroy(pmap)
+       struct pmap *pmap;
+{
+       int refs;
+#ifdef DIAGNOSTIC
+       struct cpu_info *ci;
+       CPU_INFO_ITERATOR cii;
+#endif /* DIAGNOSTIC */
+
+       /*
+        * drop reference count
+        */
+
+       simple_lock(&pmap->pm_obj.vmobjlock);
+       refs = --pmap->pm_obj.uo_refs;
+       simple_unlock(&pmap->pm_obj.vmobjlock);
+       if (refs > 0) {
+               return;
+       }
+
+#ifdef DIAGNOSTIC
+       for (CPU_INFO_FOREACH(cii, ci))
+               if (ci->ci_pmap == pmap)
+                       panic("destroying pmap being used");
+#endif /* DIAGNOSTIC */
+
+       /*
+        * reference count is zero, free pmap resources and then free pmap.
+        */
+
+       XENPRINTF(("pmap_destroy %p pm_pdirpa %p/%p\n", pmap,
+                  (void *)pmap->pm_pdirpa,
+                  (void *)xpmap_ptom(pmap->pm_pdirpa)));
+
+       /*
+        * remove it from global list of pmaps
+        */
+
+       simple_lock(&pmaps_lock);
+       LIST_REMOVE(pmap, pm_list);
+       simple_unlock(&pmaps_lock);
+
+       /*
+        * destroyed pmap shouldn't have remaining PTPs
+        */
+
+       KASSERT(pmap->pm_obj.uo_npages == 0);
+       KASSERT(TAILQ_EMPTY(&pmap->pm_obj.memq));
+
+       /*
+        * MULTIPROCESSOR -- no need to flush out of other processors'
+        * APTE space because we do that in pmap_unmap_ptes().
+        */
+       pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
+
+#ifdef USER_LDT
+       if (pmap->pm_flags & PMF_USER_LDT) {
+               /*
+                * no need to switch the LDT; this address space is gone,
+                * nothing is using it.
+                *
+                * No need to lock the pmap for ldt_free (or anything else),
+                * we're the last one to use it.
+                */
+               ldt_free(pmap);
+               uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
+                           pmap->pm_ldt_len * sizeof(union descriptor));
+       }
+#endif
+
+       pool_put(&pmap_pmap_pool, pmap);
+}
+
+/*
+ *     Add a reference to the specified pmap.
+ */
+
+void
+pmap_reference(pmap)
+       struct pmap *pmap;
+{
+       simple_lock(&pmap->pm_obj.vmobjlock);
+       pmap->pm_obj.uo_refs++;
+       simple_unlock(&pmap->pm_obj.vmobjlock);
+}
+
+#if defined(PMAP_FORK)
+/*
+ * pmap_fork: perform any necessary data structure manipulation when
+ * a VM space is forked.
+ */
+
+void
+pmap_fork(pmap1, pmap2)
+       struct pmap *pmap1, *pmap2;
+{
+       simple_lock(&pmap1->pm_obj.vmobjlock);
+       simple_lock(&pmap2->pm_obj.vmobjlock);
+
+#ifdef USER_LDT
+       /* Copy the LDT, if necessary. */
+       if (pmap1->pm_flags & PMF_USER_LDT) {
+               union descriptor *new_ldt;
+               size_t len;
+
+               len = pmap1->pm_ldt_len * sizeof(union descriptor);
+               new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len);
+               memcpy(new_ldt, pmap1->pm_ldt, len);
+               pmap2->pm_ldt = new_ldt;
+               pmap2->pm_ldt_len = pmap1->pm_ldt_len;
+               pmap2->pm_flags |= PMF_USER_LDT;
+               ldt_alloc(pmap2, new_ldt, len);
+       }
+#endif /* USER_LDT */
+
+       simple_unlock(&pmap2->pm_obj.vmobjlock);
+       simple_unlock(&pmap1->pm_obj.vmobjlock);
+}
+#endif /* PMAP_FORK */
+
+#ifdef USER_LDT
+/*
+ * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
+ * restore the default.
+ */
+
+void
+pmap_ldt_cleanup(l)
+       struct lwp *l;
+{
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
+       union descriptor *old_ldt = NULL;
+       size_t len = 0;
+
+       simple_lock(&pmap->pm_obj.vmobjlock);
+
+       if (pmap->pm_flags & PMF_USER_LDT) {
+               ldt_free(pmap);
+               pmap->pm_ldt_sel = GSEL(GLDT_SEL, SEL_KPL);
+               pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+               if (pcb == curpcb)
+                       lldt(pcb->pcb_ldt_sel);
+               old_ldt = pmap->pm_ldt;
+               len = pmap->pm_ldt_len * sizeof(union descriptor);
+               pmap->pm_ldt = NULL;
+               pmap->pm_ldt_len = 0;
+               pmap->pm_flags &= ~PMF_USER_LDT;
+       }
+
+       simple_unlock(&pmap->pm_obj.vmobjlock);
+
+       if (old_ldt != NULL)
+               uvm_km_free(kernel_map, (vaddr_t)old_ldt, len);
+}
+#endif /* USER_LDT */
+
+/*
+ * pmap_activate: activate a process' pmap
+ *
+ * => called from cpu_switch()
+ * => if lwp is the curlwp, then set ci_want_pmapload so that the
+ *    actual MMU context switch will be done by pmap_load() later
+ */
+
+void
+pmap_activate(l)
+       struct lwp *l;
+{
+       struct cpu_info *ci = curcpu();
+       struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+
+       if (l == ci->ci_curlwp) {
+               struct pcb *pcb;
+
+               KASSERT(ci->ci_want_pmapload == 0);
+               KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
+#ifdef KSTACK_CHECK_DR0
+               /*
+                * setup breakpoint on the top of stack
+                */
+               if (l == &lwp0)
+                       dr0(0, 0, 0, 0);
+               else
+                       dr0(KSTACK_LOWEST_ADDR(l), 1, 3, 1);
+#endif
+
+               /*
+                * no need to switch to kernel vmspace because
+                * it's a subset of any vmspace.
+                */
+
+               if (pmap == pmap_kernel()) {
+                       ci->ci_want_pmapload = 0;
+                       return;
+               }
+
+               pcb = &l->l_addr->u_pcb;
+               pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+
+               ci->ci_want_pmapload = 1;
+       }
+}
+
+/*
+ * pmap_reactivate: try to regain reference to the pmap.
+ */
+
+static boolean_t
+pmap_reactivate(struct pmap *pmap)
+{
+       struct cpu_info *ci = curcpu();
+       u_int32_t cpumask = 1U << ci->ci_cpuid;
+       int s;
+       boolean_t result;
+       u_int32_t oldcpus;
+
+       /*
+        * if we still have a lazy reference to this pmap,
+        * we can assume that there was no tlb shootdown
+        * for this pmap in the meantime.
+        */
+
+       s = splipi(); /* protect from tlb shootdown ipis. */
+       oldcpus = pmap->pm_cpus;
+       x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
+       if (oldcpus & cpumask) {
+               KASSERT(ci->ci_tlbstate == TLBSTATE_LAZY);
+               /* got it */
+               result = TRUE;
+       } else {
+               KASSERT(ci->ci_tlbstate == TLBSTATE_STALE);
+               result = FALSE;
+       }
+       ci->ci_tlbstate = TLBSTATE_VALID;
+       splx(s);
+
+       return result;
+}
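+
+/*
+ * Illustrative summary (not original code): ci_tlbstate tracks the
+ * lazy-TLB handshake used by pmap_reactivate(), pmap_load() and
+ * pmap_deactivate2():
+ *
+ *       TLBSTATE_VALID  pmap loaded in %cr3, TLB contents trustworthy
+ *       TLBSTATE_LAZY   deactivated, pm_cpus bit retained; the TLB is
+ *                       reusable if the bit survives until reactivation
+ *       TLBSTATE_STALE  a shootdown cleared our pm_cpus bit; tlbflush()
+ *                       is needed before the TLB can be trusted again
+ */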
+
+/*
+ * pmap_load: actually switch pmap.  (fill in %cr3 and LDT info)
+ */
+
+void
+pmap_load()
+{
+       struct cpu_info *ci = curcpu();
+       u_int32_t cpumask = 1U << ci->ci_cpuid;
+       struct pmap *pmap;
+       struct pmap *oldpmap;
+       struct lwp *l;
+       struct pcb *pcb;
+       pd_entry_t *mapdp;
+       int s;
+
+       KASSERT(ci->ci_want_pmapload);
+
+       l = ci->ci_curlwp;
+       KASSERT(l != NULL);
+       pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+       KASSERT(pmap != pmap_kernel());
+       oldpmap = ci->ci_pmap;
+
+       pcb = ci->ci_curpcb;
+       KASSERT(pcb == &l->l_addr->u_pcb);
+       /* loaded by pmap_activate */
+       KASSERT(pcb->pcb_ldt_sel == pmap->pm_ldt_sel);
+
+       if (pmap == oldpmap) {
+               if (!pmap_reactivate(pmap)) {
+
+                       /*
+                        * the pmap was changed while it was deactivated.
+                        * our tlb may be stale.
+                        */
+
+                       tlbflush();
+               }
+
+               ci->ci_want_pmapload = 0;
+               return;
+       }
+
+       /*
+        * actually switch pmap.
+        */
+
+       x86_atomic_clearbits_l(&oldpmap->pm_cpus, cpumask);
+
+       KASSERT((pmap->pm_cpus & cpumask) == 0);
+
+       KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
+       pmap_reference(pmap);
+       KERNEL_UNLOCK();
+
+       /*
+        * mark the pmap in use by this processor.
+        */
+
+       s = splipi();
+       x86_atomic_setbits_l(&pmap->pm_cpus, cpumask);
+       ci->ci_pmap = pmap;
+       ci->ci_tlbstate = TLBSTATE_VALID;
+       splx(s);
+
+       /*
+        * clear apdp slot before loading %cr3 since Xen only allows
+        * linear pagetable mappings in the current pagetable.
+        */
+       KDASSERT(curapdp == 0);
+       mapdp = (pt_entry_t *)vtomach((vaddr_t)APDP_PDE);
+       PDE_CLEAR(APDP_PDE, mapdp);
+
+       /*
+        * update tss and load corresponding registers.
+        */
+
+       lldt(pcb->pcb_ldt_sel);
+       pcb->pcb_cr3 = pmap->pm_pdirpa;
+       lcr3(pcb->pcb_cr3);
+
+       ci->ci_want_pmapload = 0;
+
+       KERNEL_LOCK(LK_EXCLUSIVE | LK_CANRECURSE);
+       pmap_destroy(oldpmap);
+       KERNEL_UNLOCK();
+}
+
+/*
+ * pmap_deactivate: deactivate a process' pmap
+ */
+
+void
+pmap_deactivate(l)
+       struct lwp *l;
+{
+
+       if (l == curlwp)
+               pmap_deactivate2(l);
+}
+
+/*
+ * pmap_deactivate2: context switch version of pmap_deactivate.
+ * always treat l as curlwp.
+ */
+
+void
+pmap_deactivate2(l)
+       struct lwp *l;
+{
+       struct pmap *pmap;
+       struct cpu_info *ci = curcpu();
+
+       if (ci->ci_want_pmapload) {
+               KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
+                   != pmap_kernel());
+               KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
+                   != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
+
+               /*
+                * userspace has not been touched.
+                * nothing to do here.
+                */
+
+               ci->ci_want_pmapload = 0;
+               return;
+       }
+
+       pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+
+       if (pmap == pmap_kernel()) {
+               return;
+       }
+
+       KASSERT(ci->ci_pmap == pmap);
+
+       KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
+       ci->ci_tlbstate = TLBSTATE_LAZY;
+       XENPRINTF(("pmap_deactivate %p ebp %p esp %p\n",
+                     l, (void *)l->l_addr->u_pcb.pcb_ebp, 
+                     (void *)l->l_addr->u_pcb.pcb_esp));
+}
+
+/*
+ * end of lifecycle functions
+ */
+
+/*
+ * some misc. functions
+ */
+
+/*
+ * pmap_extract: extract a PA for the given VA
+ */
+
+boolean_t
+pmap_extract(pmap, va, pap)
+       struct pmap *pmap;
+       vaddr_t va;
+       paddr_t *pap;
+{
+       pt_entry_t *ptes, pte;
+       pd_entry_t pde;
+
+       if (__predict_true((pde = PDE_GET(&pmap->pm_pdir[pdei(va)])) != 0)) {
+#ifdef LARGEPAGES
+               if (pde & PG_PS) {
+                       if (pap != NULL)
+                               *pap = (pde & PG_LGFRAME) | (va & ~PG_LGFRAME);
+                       return (TRUE);
+               }
+#endif
+
+               ptes = pmap_map_ptes(pmap);
+               pte = PTE_GET(&ptes[x86_btop(va)]);
+               pmap_unmap_ptes(pmap);
+
+               if (__predict_true((pte & PG_V) != 0)) {
+                       if (pap != NULL)
+                               *pap = (pte & PG_FRAME) | (va & ~PG_FRAME);
+                       return (TRUE);
+               }
+       }
+       return (FALSE);
+}
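+
+/*
+ * usage sketch (illustrative only; vtophys below is the in-tree
+ * caller for the kernel pmap):
+ *
+ *	paddr_t pa;
+ *
+ *	if (pmap_extract(pmap, va, &pa))
+ *		... pa now holds the physical address backing va ...
+ */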
+
+
+/*
+ * vtophys: virtual address to physical address.  For use by
+ * machine-dependent code only.
+ */
+
+paddr_t
+vtophys(va)
+       vaddr_t va;
+{
+       paddr_t pa;
+
+       if (pmap_extract(pmap_kernel(), va, &pa) == TRUE)
+               return (pa);
+       return (0);
+}
+
+
+/*
+ * pmap_virtual_space: used during bootup [pmap_steal_memory] to
+ *     determine the bounds of the kernel virtual address space.
+ */
+
+void
+pmap_virtual_space(startp, endp)
+       vaddr_t *startp;
+       vaddr_t *endp;
+{
+       *startp = virtual_avail;
+       *endp = virtual_end;
+}
+
+/*
+ * pmap_map: map a range of PAs into kvm
+ *
+ * => used during crash dump
+ * => XXX: pmap_map() should be phased out?
+ */
+
+vaddr_t
+pmap_map(va, spa, epa, prot)
+       vaddr_t va;
+       paddr_t spa, epa;
+       vm_prot_t prot;
+{
+       while (spa < epa) {
+               pmap_enter(pmap_kernel(), va, spa, prot, 0);
+               va += PAGE_SIZE;
+               spa += PAGE_SIZE;
+       }
+       pmap_update(pmap_kernel());
+       return va;
+}
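+
+/*
+ * usage sketch (illustrative): mapping a physical range for a dump,
+ * page-aligning the bounds first:
+ *
+ *	va = pmap_map(va, trunc_page(start_pa), round_page(end_pa),
+ *	    VM_PROT_READ);
+ *
+ * the return value is the first VA past the mapped range.
+ */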
+
+/*
+ * pmap_zero_page: zero a page
+ */
+
+void
+pmap_zero_page(pa)
+       paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+       int id = cpu_number();
+#endif
+       pt_entry_t *zpte = PTESLEW(zero_pte, id);
+       pt_entry_t *maptp;
+       caddr_t zerova = VASLEW(zerop, id);
+
+#ifdef DIAGNOSTIC
+       if (PTE_GET(zpte))
+               panic("pmap_zero_page: lock botch");
+#endif
+
+       maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
+       PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);   /* map in */
+       pmap_update_pg((vaddr_t)zerova);                /* flush TLB */
+
+       memset(zerova, 0, PAGE_SIZE);                   /* zero */
+       PTE_CLEAR(zpte, maptp);                         /* zap! */
+}
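+
+/*
+ * note on PTESLEW/VASLEW: each cpu gets its own private PTE slot and
+ * VA for the temporary mapping window, so pmap_zero_page and
+ * pmap_copy_page need no locks.  conceptually (a sketch; the real
+ * macros live in the pmap headers):
+ *
+ *	PTESLEW(pte, id) ~ (pte) + (id) * NPTECL
+ *	VASLEW(va, id)   ~ (va) + (id) * NPTECL * PAGE_SIZE
+ */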
+
+/*
+ * pmap_pageidlezero: the same, for the idle loop page zero'er.
+ * Returns TRUE if the page was zero'd, FALSE if we aborted for
+ * some reason.
+ */
+
+boolean_t
+pmap_pageidlezero(pa)
+       paddr_t pa;
+{
+#ifdef MULTIPROCESSOR
+       int id = cpu_number();
+#endif
+       pt_entry_t *zpte = PTESLEW(zero_pte, id);
+       pt_entry_t *maptp;
+       caddr_t zerova = VASLEW(zerop, id);
+       boolean_t rv = TRUE;
+       int i, *ptr;
+
+#ifdef DIAGNOSTIC
+       if (PTE_GET(zpte))
+               panic("pmap_zero_page_uncached: lock botch");
+#endif
+       maptp = (pt_entry_t *)vtomach((vaddr_t)zpte);
+       PTE_SET(zpte, maptp, (pa & PG_FRAME) | PG_V | PG_RW);   /* map in */
+       pmap_update_pg((vaddr_t)zerova);                /* flush TLB */
+       for (i = 0, ptr = (int *) zerova; i < PAGE_SIZE / sizeof(int); i++) {
+               if (sched_whichqs != 0) {
+
+                       /*
+                        * A process has become ready.  Abort now,
+                        * so we don't keep it waiting while we
+                        * do slow memory access to finish this
+                        * page.
+                        */
+
+                       rv = FALSE;
+                       break;
+               }
+               *ptr++ = 0;
+       }
+
+       PTE_CLEAR(zpte, maptp);                         /* zap! */
+       return (rv);
+}
+
+/*
+ * pmap_copy_page: copy a page
+ */
+
+void
+pmap_copy_page(srcpa, dstpa)
+       paddr_t srcpa, dstpa;
+{
+#ifdef MULTIPROCESSOR
+       int id = cpu_number();
+#endif
+       pt_entry_t *spte = PTESLEW(csrc_pte,id), *maspte;
+       pt_entry_t *dpte = PTESLEW(cdst_pte,id), *madpte;
+       caddr_t csrcva = VASLEW(csrcp, id);
+       caddr_t cdstva = VASLEW(cdstp, id);
+
+#ifdef DIAGNOSTIC
+       if (PTE_GET(spte) || PTE_GET(dpte))
+               panic("pmap_copy_page: lock botch");
+#endif
+
+       maspte = (pt_entry_t *)vtomach((vaddr_t)spte);
+       madpte = (pt_entry_t *)vtomach((vaddr_t)dpte);
+       PTE_SET(spte, maspte, (srcpa & PG_FRAME) | PG_V | PG_RW);
+       PTE_SET(dpte, madpte, (dstpa & PG_FRAME) | PG_V | PG_RW);
+       pmap_update_2pg((vaddr_t)csrcva, (vaddr_t)cdstva);
+       memcpy(cdstva, csrcva, PAGE_SIZE);
+       PTE_CLEAR(spte, maspte);                        /* zap! */
+       PTE_CLEAR(dpte, madpte);                        /* zap! */
+}
+
+/*
+ * p m a p   r e m o v e   f u n c t i o n s
+ *
+ * functions that remove mappings
+ */
+
+/*
+ * pmap_remove_ptes: remove PTEs from a PTP
+ *
+ * => must have proper locking on pmap_master_lock
+ * => caller must hold pmap's lock
+ * => PTP must be mapped into KVA
+ * => PTP should be null if pmap == pmap_kernel()
+ */
+
+static void
+pmap_remove_ptes(pmap, ptp, ptpva, startva, endva, cpumaskp, flags)
+       struct pmap *pmap;
+       struct vm_page *ptp;
+       vaddr_t ptpva;
+       vaddr_t startva, endva;
+       int32_t *cpumaskp;
+       int flags;
+{
+       struct pv_entry *pv_tofree = NULL;      /* list of pv_entrys to free */
+       struct pv_entry *pve;
+       pt_entry_t *pte = (pt_entry_t *) ptpva;
+       pt_entry_t opte;
+       pt_entry_t *maptp;
+
+       /*
+        * note that ptpva points to the PTE that maps startva.   this may
+        * or may not be the first PTE in the PTP.
+        *
+        * we loop through the PTP while there are still PTEs to look at
+        * and the wire_count is greater than 1 (because we use the wire_count
+        * to keep track of the number of real PTEs in the PTP).
+        */
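+
+       /*
+        * e.g. (sketch of the accounting): a PTP with wire_count == 3
+        * maps two live PTEs; the extra reference is the PTP's own,
+        * taken when it was allocated.  once wire_count drops to 1 the
+        * PTP is empty and the callers below free it.
+        */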
+
+       for (/*null*/; startva < endva && (ptp == NULL || ptp->wire_count > 1)
+                            ; pte++, startva += PAGE_SIZE) {
+               struct vm_page *pg;
+               struct vm_page_md *mdpg;
+
+               if (!pmap_valid_entry(*pte))
+                       continue;                       /* VA not mapped */
+               if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
+                       continue;
+               }
+
+               /* atomically save the old PTE and zap! it */
+               maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+               opte = pte_atomic_update(pte, maptp, 0);
+               pmap_exec_account(pmap, startva, opte, 0);
+
+               if (opte & PG_W)
+                       pmap->pm_stats.wired_count--;
+               pmap->pm_stats.resident_count--;
+
+               if (opte & PG_U)
+                       pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
+
+               if (ptp) {
+                       ptp->wire_count--;              /* dropping a PTE */
+                       /* Make sure that the PDE is flushed */
+                       if ((ptp->wire_count <= 1) && !(opte & PG_U))
+                               pmap_tlb_shootdown(pmap, startva, opte,
+                                   cpumaskp);
+               }
+
+               /*
+                * if we are not on a pv_head list we are done.
+                */
+
+               if ((opte & PG_PVLIST) == 0) {
+#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
+                       if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
+                               panic("pmap_remove_ptes: managed page without "
+                                     "PG_PVLIST for 0x%lx", startva);
+#endif
+                       continue;
+               }
+
+               pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+               if (pg == NULL)
+                       panic("pmap_remove_ptes: unmanaged page marked "
+                             "PG_PVLIST, va = 0x%lx, pa = 0x%lx",
+                             startva, (u_long)(opte & PG_FRAME));
+#endif
+               mdpg = &pg->mdpage;
+
+               /* sync R/M bits */
+               simple_lock(&mdpg->mp_pvhead.pvh_lock);
+               mdpg->mp_attrs |= (opte & (PG_U|PG_M));
+               pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, startva);
+               simple_unlock(&mdpg->mp_pvhead.pvh_lock);
+
+               if (pve) {
+                       SPLAY_RIGHT(pve, pv_node) = pv_tofree;
+                       pv_tofree = pve;
+               }
+
+               /* end of "for" loop: time for next pte */
+       }
+       if (pv_tofree)
+               pmap_free_pvs(pmap, pv_tofree);
+}
+
+
+/*
+ * pmap_remove_pte: remove a single PTE from a PTP
+ *
+ * => must have proper locking on pmap_master_lock
+ * => caller must hold pmap's lock
+ * => PTP must be mapped into KVA
+ * => PTP should be null if pmap == pmap_kernel()
+ * => returns true if we removed a mapping
+ */
+
+static boolean_t
+pmap_remove_pte(pmap, ptp, pte, va, cpumaskp, flags)
+       struct pmap *pmap;
+       struct vm_page *ptp;
+       pt_entry_t *pte;
+       vaddr_t va;
+       int32_t *cpumaskp;
+       int flags;
+{
+       pt_entry_t opte;
+       pt_entry_t *maptp;
+       struct pv_entry *pve;
+       struct vm_page *pg;
+       struct vm_page_md *mdpg;
+
+       if (!pmap_valid_entry(*pte))
+               return(FALSE);          /* VA not mapped */
+       if ((flags & PMAP_REMOVE_SKIPWIRED) && (*pte & PG_W)) {
+               return(FALSE);
+       }
+
+       /* atomically save the old PTE and zap! it */
+       maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+       opte = pte_atomic_update(pte, maptp, 0);
+
+       XENPRINTK(("pmap_remove_pte %p, was %08x\n", pte, opte));
+       pmap_exec_account(pmap, va, opte, 0);
+
+       if (opte & PG_W)
+               pmap->pm_stats.wired_count--;
+       pmap->pm_stats.resident_count--;
+
+       if (opte & PG_U)
+               pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+
+       if (ptp) {
+               ptp->wire_count--;              /* dropping a PTE */
+               /* Make sure that the PDE is flushed */
+               if ((ptp->wire_count <= 1) && !(opte & PG_U))
+                       pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
+
+       }
+       /*
+        * if we are not on a pv_head list we are done.
+        */
+
+       if ((opte & PG_PVLIST) == 0) {
+#if defined(DIAGNOSTIC) && !defined(DOM0OPS)
+               if (PHYS_TO_VM_PAGE(opte & PG_FRAME) != NULL)
+                       panic("pmap_remove_pte: managed page without "
+                             "PG_PVLIST for 0x%lx", va);
+#endif
+               return(TRUE);
+       }
+
+       pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+       if (pg == NULL)
+               panic("pmap_remove_pte: unmanaged page marked "
+                   "PG_PVLIST, va = 0x%lx, pa = 0x%lx", va,
+                   (u_long)(opte & PG_FRAME));
+#endif
+       mdpg = &pg->mdpage;
+
+       /* sync R/M bits */
+       simple_lock(&mdpg->mp_pvhead.pvh_lock);
+       mdpg->mp_attrs |= (opte & (PG_U|PG_M));
+       pve = pmap_remove_pv(&mdpg->mp_pvhead, pmap, va);
+       simple_unlock(&mdpg->mp_pvhead.pvh_lock);
+
+       if (pve)
+               pmap_free_pv(pmap, pve);
+       return(TRUE);
+}
+
+/*
+ * pmap_remove: top level mapping removal function
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+void
+pmap_remove(pmap, sva, eva)
+       struct pmap *pmap;
+       vaddr_t sva, eva;
+{
+       pmap_do_remove(pmap, sva, eva, PMAP_REMOVE_ALL);
+}
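+
+/*
+ * e.g. (illustrative) removing a single page:
+ *
+ *	pmap_remove(pmap, va, va + PAGE_SIZE);
+ *
+ * hits the one-page shortcut at the top of pmap_do_remove below.
+ */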
+
+/*
+ * pmap_do_remove: mapping removal guts
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+static void
+pmap_do_remove(pmap, sva, eva, flags)
+       struct pmap *pmap;
+       vaddr_t sva, eva;
+       int flags;
+{
+       pt_entry_t *ptes, opte;
+       pt_entry_t *maptp;
+       boolean_t result;
+       paddr_t ptppa;
+       vaddr_t blkendva;
+       struct vm_page *ptp;
+       int32_t cpumask = 0;
+       TAILQ_HEAD(, vm_page) empty_ptps;
+       struct cpu_info *ci;
+       struct pmap *curpmap;
+
+       /*
+        * we lock in the pmap => pv_head direction
+        */
+
+       TAILQ_INIT(&empty_ptps);
+
+       PMAP_MAP_TO_HEAD_LOCK();
+
+       ptes = pmap_map_ptes(pmap);     /* locks pmap */
+
+       ci = curcpu();
+       curpmap = ci->ci_pmap;
+
+       /*
+        * removing one page?  take shortcut function.
+        */
+
+       if (sva + PAGE_SIZE == eva) {
+               if (pmap_valid_entry(pmap->pm_pdir[pdei(sva)])) {
+
+                       /* PA of the PTP */
+                       ptppa = PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME;
+
+                       /* get PTP if non-kernel mapping */
+                       if (pmap == pmap_kernel()) {
+                               /* we never free kernel PTPs */
+                               ptp = NULL;
+                       } else {
+                               if (pmap->pm_ptphint &&
+                                   VM_PAGE_TO_PHYS(pmap->pm_ptphint) ==
+                                   ptppa) {
+                                       ptp = pmap->pm_ptphint;
+                               } else {
+                                       ptp = PHYS_TO_VM_PAGE(ptppa);
+#ifdef DIAGNOSTIC
+                                       if (ptp == NULL)
+                                               panic("pmap_remove: unmanaged "
+                                                     "PTP detected");
+#endif
+                               }
+                       }
+
+                       /* do it! */
+                       result = pmap_remove_pte(pmap, ptp,
+                           &ptes[x86_btop(sva)], sva, &cpumask, flags);
+
+                       /*
+                        * if mapping removed and the PTP is no longer
+                        * being used, free it!
+                        */
+
+                       if (result && ptp && ptp->wire_count <= 1) {
+                               /* zap! */
+                               maptp = (pt_entry_t *)vtomach(
+                                       (vaddr_t)&pmap->pm_pdir[pdei(sva)]);
+                               PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
+                                   maptp, opte);
+#if defined(MULTIPROCESSOR)
+                               /*
+                                * XXXthorpej Redundant shootdown can happen
+                                * here if we're using APTE space.
+                                */
+#endif
+                               pmap_tlb_shootdown(curpmap,
+                                   ((vaddr_t)ptes) + ptp->offset, opte,
+                                   &cpumask);
+#if defined(MULTIPROCESSOR)
+                               /*
+                                * Always shoot down the pmap's self-mapping
+                                * of the PTP.
+                                * XXXthorpej Redundant shootdown can happen
+                                * here if pmap == curpmap (not APTE space).
+                                */
+                               pmap_tlb_shootdown(pmap,
+                                   ((vaddr_t)PTE_BASE) + ptp->offset, opte,
+                                   &cpumask);
+#endif
+                               pmap->pm_stats.resident_count--;
+                               if (pmap->pm_ptphint == ptp)
+                                       pmap->pm_ptphint =
+                                           TAILQ_FIRST(&pmap->pm_obj.memq);
+                               ptp->wire_count = 0;
+                               ptp->flags |= PG_ZERO;
+                               uvm_pagerealloc(ptp, NULL, 0);
+                               TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
+                       }
+               }
+               pmap_tlb_shootnow(cpumask);
+               pmap_unmap_ptes(pmap);          /* unlock pmap */
+               PMAP_MAP_TO_HEAD_UNLOCK();
+               /* Now we can free unused ptps */
+               TAILQ_FOREACH(ptp, &empty_ptps, listq)
+                       uvm_pagefree(ptp);
+               return;
+       }
+
+       cpumask = 0;
+
+       for (/* null */ ; sva < eva ; sva = blkendva) {
+
+               /* determine range of block */
+               blkendva = x86_round_pdr(sva+1);
+               if (blkendva > eva)
+                       blkendva = eva;
+
+               /*
+                * XXXCDC: our PTE mappings should never be removed
+                * with pmap_remove!  if we allow this (and why would
+                * we?) then we end up freeing the pmap's page
+                * directory page (PDP) before we are finished using
+                * it when we hit it in the recursive mapping.  this
+                * is BAD.
+                *
+                * the long term solution is to move the PTEs out of user
+                * address space and into kernel address space (up with
+                * APTE).  then we can set VM_MAXUSER_ADDRESS to be
+                * VM_MAX_ADDRESS.
+                */
+
+               if (pdei(sva) == PDSLOT_PTE)
+                       /* XXXCDC: ugly hack to avoid freeing PDP here */
+                       continue;
+
+               if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+                       /* valid block? */
+                       continue;
+
+               /* PA of the PTP */
+               ptppa = (PDE_GET(&pmap->pm_pdir[pdei(sva)]) & PG_FRAME);
+
+               /* get PTP if non-kernel mapping */
+               if (pmap == pmap_kernel()) {
+                       /* we never free kernel PTPs */
+                       ptp = NULL;
+               } else {
+                       if (pmap->pm_ptphint &&
+                           VM_PAGE_TO_PHYS(pmap->pm_ptphint) == ptppa) {
+                               ptp = pmap->pm_ptphint;
+                       } else {
+                               ptp = PHYS_TO_VM_PAGE(ptppa);
+#ifdef DIAGNOSTIC
+                               if (ptp == NULL)
+                                       panic("pmap_remove: unmanaged PTP "
+                                             "detected");
+#endif
+                       }
+               }
+               pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[x86_btop(sva)],
+                   sva, blkendva, &cpumask, flags);
+
+               /* if PTP is no longer being used, free it! */
+               if (ptp && ptp->wire_count <= 1) {
+                       /* zap! */
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&pmap->pm_pdir[pdei(sva)]);
+                       PTE_ATOMIC_CLEAR(&pmap->pm_pdir[pdei(sva)],
+                           maptp, opte);
+#if defined(MULTIPROCESSOR)
+                       /*
+                        * XXXthorpej Redundant shootdown can happen here
+                        * if we're using APTE space.
+                        */
+#endif
+                       pmap_tlb_shootdown(curpmap,
+                           ((vaddr_t)ptes) + ptp->offset, opte, &cpumask);
+#if defined(MULTIPROCESSOR)
+                       /*
+                        * Always shoot down the pmap's self-mapping
+                        * of the PTP.
+                        * XXXthorpej Redundant shootdown can happen here
+                        * if pmap == curpmap (not APTE space).
+                        */
+                       pmap_tlb_shootdown(pmap,
+                           ((vaddr_t)PTE_BASE) + ptp->offset, opte, &cpumask);
+#endif
+                       pmap->pm_stats.resident_count--;
+                       if (pmap->pm_ptphint == ptp)    /* update hint? */
+                               pmap->pm_ptphint = pmap->pm_obj.memq.tqh_first;
+                       ptp->wire_count = 0;
+                       ptp->flags |= PG_ZERO;
+                       /* Postpone free to shootdown */
+                       uvm_pagerealloc(ptp, NULL, 0);
+                       TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq);
+               }
+       }
+
+       pmap_tlb_shootnow(cpumask);
+       pmap_unmap_ptes(pmap);
+       PMAP_MAP_TO_HEAD_UNLOCK();
+       /* Now we can free unused ptps */
+       TAILQ_FOREACH(ptp, &empty_ptps, listq)
+               uvm_pagefree(ptp);
+}
+
+/*
+ * pmap_page_remove: remove a managed vm_page from all pmaps that map it
+ *
+ * => we set pv_head => pmap locking
+ * => R/M bits are sync'd back to attrs
+ */
+
+void
+pmap_page_remove(pg)
+       struct vm_page *pg;
+{
+       struct pv_head *pvh;
+       struct pv_entry *pve, *npve, *killlist = NULL;
+       pt_entry_t *ptes, opte;
+       pt_entry_t *maptp;
+       int32_t cpumask = 0;
+       TAILQ_HEAD(, vm_page) empty_ptps;
+       struct vm_page *ptp;
+       struct cpu_info *ci;
+       struct pmap *curpmap;
+
+#ifdef DIAGNOSTIC
+       int bank, off;
+
+       bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+       if (bank == -1)
+               panic("pmap_page_remove: unmanaged page?");
+#endif
+
+       pvh = &pg->mdpage.mp_pvhead;
+       if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
+               return;
+       }
+
+       TAILQ_INIT(&empty_ptps);
+
+       /* set pv_head => pmap locking */
+       PMAP_HEAD_TO_MAP_LOCK();
+
+       ci = curcpu();
+       curpmap = ci->ci_pmap;
+
+       /* XXX: needed if we hold head->map lock? */
+       simple_lock(&pvh->pvh_lock);
+
+       for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root); pve != NULL; pve = npve) {
+               npve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve);
+               ptes = pmap_map_ptes(pve->pv_pmap);             /* locks pmap */
+
+#ifdef DIAGNOSTIC
+               if (pve->pv_ptp &&
+                   (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]) &
+                       PG_FRAME) != VM_PAGE_TO_PHYS(pve->pv_ptp)) {
+                       printf("pmap_page_remove: pg=%p: va=%lx, pv_ptp=%p\n",
+                           pg, pve->pv_va, pve->pv_ptp);
+                       printf("pmap_page_remove: PTP's phys addr: "
+                           "actual=%lx, recorded=%lx\n",
+                           (PDE_GET(&pve->pv_pmap->pm_pdir[pdei(pve->pv_va)])
+                               & PG_FRAME), VM_PAGE_TO_PHYS(pve->pv_ptp));
+                       panic("pmap_page_remove: mapped managed page has "
+                           "invalid pv_ptp field");
+               }
+#endif
+
+               /* atomically save the old PTE and zap! it */
+               maptp = (pt_entry_t *)vtomach(
+                       (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+               opte = pte_atomic_update(&ptes[x86_btop(pve->pv_va)],
+                   maptp, 0);
+
+               if (opte & PG_W)
+                       pve->pv_pmap->pm_stats.wired_count--;
+               pve->pv_pmap->pm_stats.resident_count--;
+
+               /* Shootdown only if referenced */
+               if (opte & PG_U)
+                       pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+                           &cpumask);
+
+               /* sync R/M bits */
+               pg->mdpage.mp_attrs |= (opte & (PG_U|PG_M));
+
+               /* update the PTP reference count.  free if last reference. */
+               if (pve->pv_ptp) {
+                       pve->pv_ptp->wire_count--;
+                       if (pve->pv_ptp->wire_count <= 1) {
+                               /*
+                                * Do we have to shootdown the page just to
+                                * get the pte out of the TLB ?
+                                */
+                               if (!(opte & PG_U))
+                                       pmap_tlb_shootdown(pve->pv_pmap,
+                                           pve->pv_va, opte, &cpumask);
+
+                               /* zap! */
+                               maptp = (pt_entry_t *)vtomach((vaddr_t)
+                                   &pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]);
+                               PTE_ATOMIC_CLEAR(&pve->pv_pmap->pm_pdir
+                                   [pdei(pve->pv_va)], maptp, opte);
+                               pmap_tlb_shootdown(curpmap,
+                                   ((vaddr_t)ptes) + pve->pv_ptp->offset,
+                                   opte, &cpumask);
+#if defined(MULTIPROCESSOR)
+                               /*
+                                * Always shoot down the other pmap's
+                                * self-mapping of the PTP.
+                                */
+                               pmap_tlb_shootdown(pve->pv_pmap,
+                                   ((vaddr_t)PTE_BASE) + pve->pv_ptp->offset,
+                                   opte, &cpumask);
+#endif
+                               pve->pv_pmap->pm_stats.resident_count--;
+                               /* update hint? */
+                               if (pve->pv_pmap->pm_ptphint == pve->pv_ptp)
+                                       pve->pv_pmap->pm_ptphint =
+                                           pve->pv_pmap->pm_obj.memq.tqh_first;
+                               pve->pv_ptp->wire_count = 0;
+                               pve->pv_ptp->flags |= PG_ZERO;
+                               /* Free only after the shootdown */
+                               uvm_pagerealloc(pve->pv_ptp, NULL, 0);
+                               TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp,
+                                   listq);
+                       }
+               }
+               pmap_unmap_ptes(pve->pv_pmap);          /* unlocks pmap */
+               SPLAY_REMOVE(pvtree, &pvh->pvh_root, pve); /* remove it */
+               SPLAY_RIGHT(pve, pv_node) = killlist;   /* mark it for death */
+               killlist = pve;
+       }
+       pmap_free_pvs(NULL, killlist);
+       simple_unlock(&pvh->pvh_lock);
+       PMAP_HEAD_TO_MAP_UNLOCK();
+       pmap_tlb_shootnow(cpumask);
+
+       /* Now we can free unused ptps */
+       TAILQ_FOREACH(ptp, &empty_ptps, listq)
+               uvm_pagefree(ptp);
+}
+
+/*
+ * p m a p   a t t r i b u t e  f u n c t i o n s
+ * functions that test/change managed page's attributes
+ * since a page can be mapped multiple times we must check each PTE that
+ * maps it by going down the pv lists.
+ */
+
+/*
+ * pmap_test_attrs: test a page's attributes
+ *
+ * => we set pv_head => pmap locking
+ */
+
+boolean_t
+pmap_test_attrs(pg, testbits)
+       struct vm_page *pg;
+       int testbits;
+{
+       struct vm_page_md *mdpg;
+       int *myattrs;
+       struct pv_head *pvh;
+       struct pv_entry *pve;
+       volatile pt_entry_t *ptes;
+       pt_entry_t pte;
+
+#ifdef DIAGNOSTIC
+       int bank, off;
+
+       bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+       if (bank == -1)
+               panic("pmap_test_attrs: unmanaged page?");
+#endif
+       mdpg = &pg->mdpage;
+
+       /*
+        * before locking: see if attributes are already set and if so,
+        * return!
+        */
+
+       myattrs = &mdpg->mp_attrs;
+       if (*myattrs & testbits)
+               return(TRUE);
+
+       /* test to see if there is a list before bothering to lock */
+       pvh = &mdpg->mp_pvhead;
+       if (SPLAY_ROOT(&pvh->pvh_root) == NULL) {
+               return(FALSE);
+       }
+
+       /* nope, gonna have to do it the hard way */
+       PMAP_HEAD_TO_MAP_LOCK();
+       /* XXX: needed if we hold head->map lock? */
+       simple_lock(&pvh->pvh_lock);
+
+       for (pve = SPLAY_MIN(pvtree, &pvh->pvh_root);
+            pve != NULL && (*myattrs & testbits) == 0;
+            pve = SPLAY_NEXT(pvtree, &pvh->pvh_root, pve)) {
+               ptes = pmap_map_ptes(pve->pv_pmap);
+               pte = PTE_GET(&ptes[x86_btop(pve->pv_va)]); /* XXX flags only? */
+               pmap_unmap_ptes(pve->pv_pmap);
+               *myattrs |= pte;
+       }
+
+       /*
+        * note that we will exit the for loop with a non-null pve if
+        * we have found the bits we are testing for.
+        */
+
+       simple_unlock(&pvh->pvh_lock);
+       PMAP_HEAD_TO_MAP_UNLOCK();
+       return((*myattrs & testbits) != 0);
+}
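+
+/*
+ * usage sketch: the MI modified/referenced queries are normally thin
+ * wrappers around this (an assumption -- the actual definitions live
+ * in pmap.h):
+ *
+ *	pmap_is_modified(pg)   ~ pmap_test_attrs(pg, PG_M)
+ *	pmap_is_referenced(pg) ~ pmap_test_attrs(pg, PG_U)
+ */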
+
+/*
+ * pmap_clear_attrs: clear the specified attribute for a page.
+ *
+ * => we set pv_head => pmap locking
+ * => we return TRUE if we cleared one of the bits we were asked to
+ */
+
+boolean_t
+pmap_clear_attrs(pg, clearbits)
+       struct vm_page *pg;
+       int clearbits;
+{
+       struct vm_page_md *mdpg;
+       u_int32_t result;
+       struct pv_head *pvh;
+       struct pv_entry *pve;
+       pt_entry_t *ptes, opte;
+       pt_entry_t *maptp;
+       int *myattrs;
+       int32_t cpumask = 0;
+
+#ifdef DIAGNOSTIC
+       int bank, off;
+
+       bank = vm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), &off);
+       if (bank == -1)
+               panic("pmap_change_attrs: unmanaged page?");
+#endif
+       mdpg = &pg->mdpage;
+
+       PMAP_HEAD_TO_MAP_LOCK();
+       pvh = &mdpg->mp_pvhead;
+       /* XXX: needed if we hold head->map lock? */
+       simple_lock(&pvh->pvh_lock);
+
+       myattrs = &mdpg->mp_attrs;
+       result = *myattrs & clearbits;
+       *myattrs &= ~clearbits;
+
+       SPLAY_FOREACH(pve, pvtree, &pvh->pvh_root) {
+#ifdef DIAGNOSTIC
+               if (!pmap_valid_entry(pve->pv_pmap->pm_pdir[pdei(pve->pv_va)]))
+                       panic("pmap_change_attrs: mapping without PTP "
+                             "detected");
+#endif
+
+               ptes = pmap_map_ptes(pve->pv_pmap);     /* locks pmap */
+               opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
+               if (opte & clearbits) {
+                       /* We need to do something */
+                       if (clearbits == PG_RW) {
+                               result |= PG_RW;
+
+                               /*
+                                * On write protect we might not need to flush 
+                                * the TLB
+                                */
+
+                               /* First zap the RW bit! */
+                               maptp = (pt_entry_t *)vtomach(
+                                       (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+                               PTE_ATOMIC_CLEARBITS(
+                                       &ptes[x86_btop(pve->pv_va)],
+                                       maptp, PG_RW);
+                               opte = PTE_GET(&ptes[x86_btop(pve->pv_va)]);
+
+                               /*
+                                * then check whether it may still be cached
+                                * as writable in the TLB
+                                */
+                               if (!(opte & PG_M))
+                                       goto no_tlb_shootdown;
+                       }
+
+                       /*
+                        * Since we need a shootdown we might as well
+                        * always clear PG_U and PG_M.
+                        */
+
+                       /* zap! */
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&ptes[x86_btop(pve->pv_va)]);
+                       PTE_ATOMIC_SET(&ptes[x86_btop(pve->pv_va)], maptp,
+                           (opte & ~(PG_U | PG_M)), opte);
+
+                       result |= (opte & clearbits);
+                       *myattrs |= (opte & ~(clearbits));
+
+                       pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
+                                          &cpumask);
+               }
+no_tlb_shootdown:
+               pmap_unmap_ptes(pve->pv_pmap);          /* unlocks pmap */
+       }
+
+       simple_unlock(&pvh->pvh_lock);
+       PMAP_HEAD_TO_MAP_UNLOCK();
+
+       pmap_tlb_shootnow(cpumask);
+       return(result != 0);
+}
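+
+/*
+ * usage sketch, mirroring pmap_test_attrs above (again an assumption
+ * about the pmap.h wrappers):
+ *
+ *	pmap_clear_modify(pg)    ~ pmap_clear_attrs(pg, PG_M)
+ *	pmap_clear_reference(pg) ~ pmap_clear_attrs(pg, PG_U)
+ */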
+
+
+/*
+ * p m a p   p r o t e c t i o n   f u n c t i o n s
+ */
+
+/*
+ * pmap_page_protect: change the protection of all recorded mappings
+ *     of a managed page
+ *
+ * => NOTE: this is an inline function in pmap.h
+ */
+
+/* see pmap.h */
+
+/*
+ * pmap_protect: set the protection of the pages in a pmap
+ *
+ * => NOTE: this is an inline function in pmap.h
+ */
+
+/* see pmap.h */
+
+/*
+ * pmap_write_protect: write-protect pages in a pmap
+ */
+
+void
+pmap_write_protect(pmap, sva, eva, prot)
+       struct pmap *pmap;
+       vaddr_t sva, eva;
+       vm_prot_t prot;
+{
+       pt_entry_t *ptes, *epte;
+       pt_entry_t *maptp;
+#ifndef XEN
+       volatile
+#endif
+               pt_entry_t *spte;
+       vaddr_t blockend;
+       int32_t cpumask = 0;
+
+       ptes = pmap_map_ptes(pmap);             /* locks pmap */
+
+       /* should be ok, but just in case ... */
+       sva &= PG_FRAME;
+       eva &= PG_FRAME;
+
+       for (/* null */ ; sva < eva ; sva = blockend) {
+
+               blockend = (sva & PD_MASK) + NBPD;
+               if (blockend > eva)
+                       blockend = eva;
+
+               /*
+                * XXXCDC: our PTE mappings should never be write-protected!
+                *
+                * the long term solution is to move the PTEs out of user
+                * address space and into kernel address space (up with
+                * APTE).  then we can set VM_MAXUSER_ADDRESS to be
+                * VM_MAX_ADDRESS.
+                */
+
+               /* XXXCDC: ugly hack to avoid freeing PDP here */
+               if (pdei(sva) == PDSLOT_PTE)
+                       continue;
+
+               /* empty block? */
+               if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+                       continue;
+
+#ifdef DIAGNOSTIC
+               if (sva >= VM_MAXUSER_ADDRESS &&
+                   sva < VM_MAX_ADDRESS)
+                       panic("pmap_write_protect: PTE space");
+#endif
+
+               spte = &ptes[x86_btop(sva)];
+               epte = &ptes[x86_btop(blockend)];
+
+               for (/*null */; spte < epte ; spte++) {
+                       if ((PTE_GET(spte) & (PG_RW|PG_V)) == (PG_RW|PG_V)) {
+                               maptp = (pt_entry_t *)vtomach((vaddr_t)spte);
+                               PTE_ATOMIC_CLEARBITS(spte, maptp, PG_RW);
+                               if (PTE_GET(spte) & PG_M)
+                                       pmap_tlb_shootdown(pmap,
+                                           x86_ptob(spte - ptes),
+                                           PTE_GET(spte), &cpumask);
+                       }
+               }
+       }
+
+       /*
+        * if we kept a removal record and removed some pages, update the TLB
+        */
+
+       pmap_tlb_shootnow(cpumask);
+       pmap_unmap_ptes(pmap);          /* unlocks pmap */
+}
+
+/*
+ * end of protection functions
+ */
+
+/*
+ * pmap_unwire: clear the wired bit in the PTE
+ *
+ * => mapping should already be in map
+ */
+
+void
+pmap_unwire(pmap, va)
+       struct pmap *pmap;
+       vaddr_t va;
+{
+       pt_entry_t *ptes;
+       pt_entry_t *maptp;
+
+       if (pmap_valid_entry(pmap->pm_pdir[pdei(va)])) {
+               ptes = pmap_map_ptes(pmap);             /* locks pmap */
+
+#ifdef DIAGNOSTIC
+               if (!pmap_valid_entry(ptes[x86_btop(va)]))
+                       panic("pmap_unwire: invalid (unmapped) va 0x%lx", va);
+#endif
+               if ((ptes[x86_btop(va)] & PG_W) != 0) {
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&ptes[x86_btop(va)]);
+                       PTE_ATOMIC_CLEARBITS(&ptes[x86_btop(va)], maptp, PG_W);
+                       pmap->pm_stats.wired_count--;
+               }
+#ifdef DIAGNOSTIC
+               else {
+                       printf("pmap_unwire: wiring for pmap %p va 0x%lx "
+                              "didn't change!\n", pmap, va);
+               }
+#endif
+               pmap_unmap_ptes(pmap);          /* unlocks map */
+       }
+#ifdef DIAGNOSTIC
+       else {
+               panic("pmap_unwire: invalid PDE");
+       }
+#endif
+}
+
+/*
+ * pmap_collect: free resources held by a pmap
+ *
+ * => optional function.
+ * => called when a process is swapped out to free memory.
+ */
+
+void
+pmap_collect(pmap)
+       struct pmap *pmap;
+{
+       /*
+        * free all of the pt pages by removing the physical mappings
+        * for its entire address space.
+        */
+
+       pmap_do_remove(pmap, VM_MIN_ADDRESS, VM_MAX_ADDRESS,
+           PMAP_REMOVE_SKIPWIRED);
+}
+
+/*
+ * pmap_copy: copy mappings from one pmap to another
+ *
+ * => optional function
+ * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
+ */
+
+/*
+ * defined as macro in pmap.h
+ */
+
+/*
+ * pmap_enter: enter a mapping into a pmap
+ *
+ * => must be done "now" ... no lazy-evaluation
+ * => we set pmap => pv_head locking
+ */
+
+int
+pmap_enter(pmap, va, pa, prot, flags)
+       struct pmap *pmap;
+       vaddr_t va;
+       paddr_t pa;
+       vm_prot_t prot;
+       int flags;
+{
+       pt_entry_t *ptes, opte, npte;
+       struct vm_page *ptp, *pg;
+       struct vm_page_md *mdpg;
+       struct pv_head *old_pvh, *new_pvh;
+       struct pv_entry *pve = NULL; /* XXX gcc */
+       int error;
+       boolean_t wired = (flags & PMAP_WIRED) != 0;
+       pt_entry_t *maptp;
+
+       XENPRINTK(("pmap_enter(%p, %p, %p, %08x, %08x)\n",
+           pmap, (void *)va, (void *)pa, prot, flags));
+
+#ifdef DIAGNOSTIC
+       /* sanity check: totally out of range? */
+       if (va >= VM_MAX_KERNEL_ADDRESS)
+               panic("pmap_enter: too big");
+
+       if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
+               panic("pmap_enter: trying to map over PDP/APDP!");
+
+       /* sanity check: kernel PTPs should already have been pre-allocated */
+       if (va >= VM_MIN_KERNEL_ADDRESS &&
+           !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
+               panic("pmap_enter: missing kernel PTP!");
+#endif
+
+       npte = protection_codes[prot] | PG_V;
+
+       if (pa >= pmap_pa_start && pa < pmap_pa_end)
+               npte |= xpmap_ptom(pa);
+       else {
+               XENPRINTF(("pmap_enter: va %08lx outside pa range %08lx\n",
+                   va, pa));
+               npte |= pa;
+       }
+
+       /* XENPRINTK(("npte %p\n", npte)); */
+
+       if (wired)
+               npte |= PG_W;
+
+       if (va < VM_MAXUSER_ADDRESS)
+               npte |= PG_u;
+       else if (va < VM_MAX_ADDRESS)
+               npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
+       if (pmap == pmap_kernel())
+               npte |= pmap_pg_g;
+
+       /* get lock */
+       PMAP_MAP_TO_HEAD_LOCK();
+
+       ptes = pmap_map_ptes(pmap);             /* locks pmap */
+       if (pmap == pmap_kernel()) {
+               ptp = NULL;
+       } else {
+               ptp = pmap_get_ptp(pmap, pdei(va));
+               if (ptp == NULL) {
+                       if (flags & PMAP_CANFAIL) {
+                               error = ENOMEM;
+                               goto out;
+                       }
+                       panic("pmap_enter: get ptp failed");
+               }
+       }
+
+       /*
+        * Get a first view of the old PTE.  On SMP the PTE might gain
+        * PG_U and PG_M flags before we zap it later.
+        */
+       opte = pte_get(&ptes[x86_btop(va)]);            /* old PTE */
+       XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 
+                     (void *)npte, (void *)opte, ptes, x86_btop(va)));
+
+       /*
+        * is there currently a valid mapping at our VA and does it
+        * map to the same PA as the one we want to map ?
+        */
+
+       if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
+
+               /*
+                * first, calculate pm_stats updates.  resident count will not
+                * change since we are replacing/changing a valid mapping.
+                * wired count might change...
+                */
+               pmap->pm_stats.wired_count +=
+                   ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
+
+               npte |= (opte & PG_PVLIST);
+
+               XENPRINTK(("pmap update opte == pa"));
+               /* zap! */
+               maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+               opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
+
+               /*
+                * Any change in the protection level that the CPU
+                * should know about?
+                */
+               if ((npte & PG_RW)
+                    || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
+                       XENPRINTK(("pmap update opte == pa, prot change"));
+                       /*
+                        * No need to flush the TLB.
+                        * Just add old PG_M, ... flags in new entry.
+                        */
+                       PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
+                           opte & (PG_M | PG_U));
+                       goto out_ok;
+               }
+
+               /*
+                * The old mapping might be cached in the TLB as writable;
+                * if it is on the PVLIST, sync the R/M bits.
+                */
+               if (opte & PG_PVLIST) {
+                       pg = PHYS_TO_VM_PAGE(pa);
+#ifdef DIAGNOSTIC
+                       if (pg == NULL)
+                               panic("pmap_enter: same pa PG_PVLIST "
+                                     "mapping with unmanaged page "
+                                     "pa = 0x%lx (0x%lx)", pa,
+                                     atop(pa));
+#endif
+                       mdpg = &pg->mdpage;
+                       old_pvh = &mdpg->mp_pvhead;
+                       simple_lock(&old_pvh->pvh_lock);
+                       mdpg->mp_attrs |= opte;
+                       simple_unlock(&old_pvh->pvh_lock);
+               }
+               goto shootdown_now;
+       }
+
+       pg = PHYS_TO_VM_PAGE(pa);
+       XENPRINTK(("pg %p from %p, init %d\n", pg, (void *)pa,
+                     pmap_initialized));
+       if (pmap_initialized && pg != NULL) {
+               /* This is a managed page */
+               npte |= PG_PVLIST;
+               mdpg = &pg->mdpage;
+               new_pvh = &mdpg->mp_pvhead;
+               if ((opte & (PG_PVLIST | PG_V)) != (PG_PVLIST | PG_V)) {
+                       /* We cannot steal a pve - allocate one */
+                       pve = pmap_alloc_pv(pmap, ALLOCPV_NEED);
+                       if (pve == NULL) {
+                               if (!(flags & PMAP_CANFAIL))
+                                       panic("pmap_enter: "
+                                           "no pv entries available");
+                               error = ENOMEM;
+                               goto out;
+                       }
+               }
+       } else {
+               new_pvh = NULL;
+       }
+
+       /*
+        * is there currently a valid mapping at our VA?
+        */
+
+       if (pmap_valid_entry(opte)) {
+
+               /*
+                * changing PAs: we must remove the old one first
+                */
+
+               /*
+                * first, calculate pm_stats updates.  resident count will not
+                * change since we are replacing/changing a valid mapping.
+                * wired count might change...
+                */
+               pmap->pm_stats.wired_count +=
+                   ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
+
+               if (opte & PG_PVLIST) {
+                       pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+                       if (pg == NULL)
+                               panic("pmap_enter: PG_PVLIST mapping with "
+                                     "unmanaged page "
+                                     "pa = 0x%lx (0x%lx)", pa, atop(pa));
+#endif
+                       mdpg = &pg->mdpage;
+                       old_pvh = &mdpg->mp_pvhead;
+
+                       /* new_pvh is NULL if page will not be managed */
+                       pmap_lock_pvhs(old_pvh, new_pvh);
+
+                       XENPRINTK(("pmap change pa"));
+                       /* zap! */
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&ptes[x86_btop(va)]);
+                       opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
+                                                   npte);
+
+                       pve = pmap_remove_pv(old_pvh, pmap, va);
+                       KASSERT(pve != 0);
+                       mdpg->mp_attrs |= opte;
+
+                       if (new_pvh) {
+                               pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
+                               simple_unlock(&new_pvh->pvh_lock);
+                       } else
+                               pmap_free_pv(pmap, pve);
+                       simple_unlock(&old_pvh->pvh_lock);
+
+                       goto shootdown_test;
+               }
+       } else {        /* opte not valid */
+               pmap->pm_stats.resident_count++;
+               if (wired) 
+                       pmap->pm_stats.wired_count++;
+               if (ptp)
+                       ptp->wire_count++;
+       }
+
+       if (new_pvh) {
+               simple_lock(&new_pvh->pvh_lock);
+               pmap_enter_pv(new_pvh, pve, pmap, va, ptp);
+               simple_unlock(&new_pvh->pvh_lock);
+       }
+
+       XENPRINTK(("pmap initial setup\n"));
+       maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+       opte = pte_atomic_update_ma(&ptes[x86_btop(va)],
+           maptp, npte); /* zap! */
+
+shootdown_test:
+       /* Update page attributes if needed */
+       if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+               int32_t cpumask = 0;
+#endif
+shootdown_now:
+#if defined(MULTIPROCESSOR)
+               pmap_tlb_shootdown(pmap, va, opte, &cpumask);
+               pmap_tlb_shootnow(cpumask);
+#else
+               /* Don't bother deferring in the single CPU case. */
+               if (pmap_is_curpmap(pmap))
+                       pmap_update_pg(va);
+#endif
+       }
+
+out_ok:
+       error = 0;
+
+out:
+       pmap_unmap_ptes(pmap);
+       PMAP_MAP_TO_HEAD_UNLOCK();
+
+       XENPRINTK(("pmap_enter: %d\n", error));
+       return error;
+}
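+
+/*
+ * usage sketch (illustrative): a mapping attempt that is allowed to
+ * fail instead of panicking on resource shortage:
+ *
+ *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
+ *	    PMAP_WIRED | PMAP_CANFAIL);
+ *	if (error == 0)
+ *		pmap_update(pmap);
+ */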
+
+/*
+ * pmap_enter_ma: enter a mapping into a pmap
+ *
+ * => must be done "now" ... no lazy-evaluation
+ * => we set pmap => pv_head locking
+ */
+
+int
+pmap_enter_ma(pmap, va, pa, prot, flags)
+       struct pmap *pmap;
+       vaddr_t va;
+       paddr_t pa;
+       vm_prot_t prot;
+       int flags;
+{
+       pt_entry_t *ptes, opte, npte;
+       pt_entry_t *maptp;
+       struct vm_page *ptp, *pg;
+       struct vm_page_md *mdpg;
+       struct pv_head *old_pvh;
+       struct pv_entry *pve = NULL; /* XXX gcc */
+       int error;
+       boolean_t wired = (flags & PMAP_WIRED) != 0;
+
+       XENPRINTK(("pmap_enter_ma(%p, %p, %p, %08x, %08x)\n",
+           pmap, (void *)va, (void *)pa, prot, flags));
+
+#ifdef DIAGNOSTIC
+       /* sanity check: totally out of range? */
+       if (va >= VM_MAX_KERNEL_ADDRESS)
+               panic("pmap_enter: too big");
+
+       if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
+               panic("pmap_enter: trying to map over PDP/APDP!");
+
+       /* sanity check: kernel PTPs should already have been pre-allocated */
+       if (va >= VM_MIN_KERNEL_ADDRESS &&
+           !pmap_valid_entry(pmap->pm_pdir[pdei(va)]))
+               panic("pmap_enter: missing kernel PTP!");
+#endif
+
+       npte = pa | protection_codes[prot] | PG_V;
+       /* XENPRINTK(("npte %p\n", npte)); */
+
+       if (wired)
+               npte |= PG_W;
+
+       if (va < VM_MAXUSER_ADDRESS)
+               npte |= PG_u;
+       else if (va < VM_MAX_ADDRESS)
+               npte |= (PG_u | PG_RW); /* XXXCDC: no longer needed? */
+       if (pmap == pmap_kernel())
+               npte |= pmap_pg_g;
+
+       /* get lock */
+       PMAP_MAP_TO_HEAD_LOCK();
+
+       ptes = pmap_map_ptes(pmap);             /* locks pmap */
+       if (pmap == pmap_kernel()) {
+               ptp = NULL;
+       } else {
+               ptp = pmap_get_ptp(pmap, pdei(va));
+               if (ptp == NULL) {
+                       if (flags & PMAP_CANFAIL) {
+                               error = ENOMEM;
+                               goto out;
+                       }
+                       panic("pmap_enter: get ptp failed");
+               }
+       }
+
+       /*
+        * Get a first view of the old PTE.  On SMP the PTE might gain
+        * PG_U and PG_M flags before we zap it later.
+        */
+       opte = pte_get_ma(&ptes[x86_btop(va)]);         /* old PTE */
+       XENPRINTK(("npte %p opte %p ptes %p idx %03x\n", 
+                     (void *)npte, (void *)opte, ptes, x86_btop(va)));
+       XENPRINTF(("pmap_enter_ma pa %08lx va %08lx opte %08x npte %08x "
+           "wired %d count %ld\n", pa, va, opte, npte, wired,
+           pmap->pm_stats.wired_count));
+
+       /*
+        * is there currently a valid mapping at our VA and does it
+        * map to the same MA as the one we want to map ?
+        */
+
+       if (pmap_valid_entry(opte) && ((opte & PG_FRAME) == pa)) {
+
+               /*
+                * first, calculate pm_stats updates.  resident count will not
+                * change since we are replacing/changing a valid mapping.
+                * wired count might change...
+                */
+               pmap->pm_stats.wired_count +=
+                   ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
+
+               XENPRINTK(("pmap update opte == pa"));
+               /* zap! */
+               maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+               opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp, npte);
+
+               /*
+                * Any change in the protection level that the CPU
+                * should know about?
+                */
+               if ((npte & PG_RW)
+                    || ((opte & (PG_M | PG_RW)) != (PG_M | PG_RW))) {
+                       XENPRINTK(("pmap update opte == pa, prot change"));
+                       /*
+                        * No need to flush the TLB.
+                        * Just add old PG_M, ... flags in new entry.
+                        */
+                       PTE_ATOMIC_SETBITS(&ptes[x86_btop(va)], maptp,
+                           opte & (PG_M | PG_U));
+                       goto out_ok;
+               }
+
+               /*
+                * The old mapping might be cached in the TLB as writable;
+                * if it is on the PVLIST, sync the R/M bits.
+                */
+               KDASSERT((opte & PG_PVLIST) == 0);
+               goto shootdown_now;
+       }
+
+       /* 
+        * no managed mapping for pages mapped through pmap_enter_ma.
+        */
+
+       /*
+        * is there currently a valid mapping at our VA?
+        */
+
+       if (pmap_valid_entry(opte)) {
+
+               /*
+                * changing PAs: we must remove the old one first
+                */
+
+               /*
+                * first, calculate pm_stats updates.  resident count will not
+                * change since we are replacing/changing a valid mapping.
+                * wired count might change...
+                */
+               pmap->pm_stats.wired_count +=
+                   ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
+
+               if (opte & PG_PVLIST) {
+                       opte = xpmap_mtop(opte);
+                       KDASSERT((opte & PG_FRAME) !=
+                           (KERNTEXTOFF - KERNBASE_LOCORE));
+
+                       pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
+#ifdef DIAGNOSTIC
+                       if (pg == NULL)
+                               panic("pmap_enter: PG_PVLIST mapping with "
+                                     "unmanaged page "
+                                     "pa = 0x%lx (0x%lx)", pa, atop(pa));
+#endif
+                       mdpg = &pg->mdpage;
+                       old_pvh = &mdpg->mp_pvhead;
+
+                       /* NULL new_pvh since page will not be managed */
+                       pmap_lock_pvhs(old_pvh, NULL);
+
+                       XENPRINTK(("pmap change pa"));
+                       /* zap! */
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&ptes[x86_btop(va)]);
+                       opte = pte_atomic_update_ma(&ptes[x86_btop(va)], maptp,
+                                                   npte);
+
+                       pve = pmap_remove_pv(old_pvh, pmap, va);
+                       KASSERT(pve != 0);
+                       mdpg->mp_attrs |= opte;
+
+                       pmap_free_pv(pmap, pve);
+                       simple_unlock(&old_pvh->pvh_lock);
+
+                       goto shootdown_test;
+               }
+       } else {        /* opte not valid */
+               pmap->pm_stats.resident_count++;
+               if (wired) 
+                       pmap->pm_stats.wired_count++;
+               if (ptp)
+                       ptp->wire_count++;
+       }
+
+       XENPRINTK(("pmap initial setup"));
+       maptp = (pt_entry_t *)vtomach((vaddr_t)&ptes[x86_btop(va)]);
+       opte = pte_atomic_update_ma(&ptes[x86_btop(va)],
+           maptp, npte); /* zap! */
+
+shootdown_test:
+       /* Update page attributes if needed */
+       if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
+#if defined(MULTIPROCESSOR)
+               int32_t cpumask = 0;
+#endif
+shootdown_now:
+#if defined(MULTIPROCESSOR)
+               pmap_tlb_shootdown(pmap, va, opte, &cpumask);
+               pmap_tlb_shootnow(cpumask);
+#else
+               /* Don't bother deferring in the single CPU case. */
+               if (pmap_is_curpmap(pmap))
+                       pmap_update_pg(va);
+#endif
+       }
+
+out_ok:
+       error = 0;
+
+out:
+       pmap_unmap_ptes(pmap);
+       PMAP_MAP_TO_HEAD_UNLOCK();
+
+       XENPRINTK(("pmap_enter: %d\n", error));
+       return error;
+}
+
+/*
+ * pmap_growkernel: increase usage of KVM space
+ *
+ * => we allocate new PTPs for the kernel and install them in all
+ *     the pmaps on the system.
+ */
+
+vaddr_t
+pmap_growkernel(maxkvaddr)
+       vaddr_t maxkvaddr;
+{
+       struct pmap *kpm = pmap_kernel(), *pm;
+       pd_entry_t *mapdp;
+       pt_entry_t *maptp;
+       int needed_kpde;   /* needed number of kernel PTPs */
+       int s;
+       paddr_t ptaddr;
+
+       needed_kpde = (u_int)(maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD-1))
+               / NBPD;
+       XENPRINTF(("pmap_growkernel %p: %d -> %d\n", (void *)maxkvaddr,
+                     nkpde, needed_kpde));
+       if (needed_kpde <= nkpde)
+               goto out;               /* we are OK */
+
+       /*
+        * whoops!   we need to add kernel PTPs
+        */
+
+       s = splhigh();  /* to be safe */
+       simple_lock(&kpm->pm_obj.vmobjlock);
+
+       for (/*null*/ ; nkpde < needed_kpde ; nkpde++) {
+
+               mapdp = (pt_entry_t *)vtomach((vaddr_t)&kpm->pm_pdir[PDSLOT_KERN + nkpde]);
+               if (uvm.page_init_done == FALSE) {
+
+                       /*
+                        * we're growing the kernel pmap early (from
+                        * uvm_pageboot_alloc()).  this case must be
+                        * handled a little differently.
+                        */
+
+                       if (uvm_page_physget(&ptaddr) == FALSE)
+                               panic("pmap_growkernel: out of memory");
+                       pmap_zero_page(ptaddr);
+
+                       XENPRINTF(("xxxx maybe not PG_RW\n"));
+                       PDE_SET(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, ptaddr | PG_RW | PG_V);
+
+                       /* count PTP as resident */
+                       kpm->pm_stats.resident_count++;
+                       continue;
+               }
+
+               /*
+                * THIS *MUST* BE CODED SO AS TO WORK IN THE
+                * pmap_initialized == FALSE CASE!  WE MAY BE
+                * INVOKED WHILE pmap_init() IS RUNNING!
+                */
+
+               if (pmap_alloc_ptp(kpm, PDSLOT_KERN + nkpde) == NULL) {
+                       panic("pmap_growkernel: alloc ptp failed");
+               }
+
+               /* PG_u not for kernel */
+               PDE_CLEARBITS(&kpm->pm_pdir[PDSLOT_KERN + nkpde], mapdp, PG_u);
+
+               /* distribute new kernel PTP to all active pmaps */
+               simple_lock(&pmaps_lock);
+               for (pm = pmaps.lh_first; pm != NULL;
+                    pm = pm->pm_list.le_next) {
+                       XENPRINTF(("update\n"));
+                       maptp = (pt_entry_t *)vtomach(
+                               (vaddr_t)&pm->pm_pdir[PDSLOT_KERN + nkpde]);
+                       PDE_COPY(&pm->pm_pdir[PDSLOT_KERN + nkpde], maptp,
+                           &kpm->pm_pdir[PDSLOT_KERN + nkpde]);
+               }
+
+               /* Invalidate the PDP cache. */
+               pool_cache_invalidate(&pmap_pdp_cache);
+               pmap_pdp_cache_generation++;
+
+               simple_unlock(&pmaps_lock);
+       }
+
+       simple_unlock(&kpm->pm_obj.vmobjlock);
+       splx(s);
+
+out:
+       XENPRINTF(("pmap_growkernel return %d %p\n", nkpde,
+                     (void *)(VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD))));
+       return (VM_MIN_KERNEL_ADDRESS + (nkpde * NBPD));
+}
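
pmap_growkernel rounds the requested top of KVA up to whole page-table pages: with 4MB mapped per PDE, growing to maxkvaddr needs ceil((maxkvaddr - VM_MIN_KERNEL_ADDRESS) / NBPD) kernel PTPs. A standalone sketch of that arithmetic, with both constants chosen only for illustration:

    #include <stdio.h>

    #define NBPD                  (1UL << 22)    /* 4MB per kernel PDE (assumed) */
    #define VM_MIN_KERNEL_ADDRESS 0xc0000000UL   /* hypothetical 3GB split */

    int
    main(void)
    {
            unsigned long maxkvaddr = VM_MIN_KERNEL_ADDRESS + 10 * (1UL << 20);
            unsigned long needed =
                (maxkvaddr - VM_MIN_KERNEL_ADDRESS + (NBPD - 1)) / NBPD;

            /* 10MB of KVA rounds up to 3 4MB PTPs. */
            printf("needed_kpde = %lu\n", needed);
            return 0;
    }
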
+
+#ifdef DEBUG
+void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
+
+/*
+ * pmap_dump: dump all the mappings from a pmap
+ *
+ * => caller should not be holding any pmap locks
+ */
+
+void
+pmap_dump(pmap, sva, eva)
+       struct pmap *pmap;
+       vaddr_t sva, eva;
+{
+       pt_entry_t *ptes, *pte;
+       vaddr_t blkendva;
+
+       /*
+        * if end is out of range, truncate.
+        * if (end <= start), update to max.
+        */
+
+       if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
+               eva = VM_MAXUSER_ADDRESS;
+
+       /*
+        * we lock in the pmap => pv_head direction
+        */
+
+       PMAP_MAP_TO_HEAD_LOCK();
+       ptes = pmap_map_ptes(pmap);     /* locks pmap */
+
+       /*
+        * dumping a range of pages: we dump in PTP sized blocks (4MB)
+        */
+
+       for (/* null */ ; sva < eva ; sva = blkendva) {
+
+               /* determine range of block */
+               blkendva = x86_round_pdr(sva+1);
+               if (blkendva > eva)
+                       blkendva = eva;
+
+               /* valid block? */
+               if (!pmap_valid_entry(pmap->pm_pdir[pdei(sva)]))
+                       continue;
+
+               pte = &ptes[x86_btop(sva)];
+               for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
+                       if (!pmap_valid_entry(*pte))
+                               continue;
+                       XENPRINTF(("va %#lx -> pa %#lx (pte=%#lx)\n",
+                              sva, PTE_GET(pte), PTE_GET(pte) & PG_FRAME));
+               }
+       }
+       pmap_unmap_ptes(pmap);
+       PMAP_MAP_TO_HEAD_UNLOCK();
+}
+#endif
+
+/******************** TLB shootdown code ********************/
+
+
+void
+pmap_tlb_shootnow(int32_t cpumask)
+{
+       struct cpu_info *self;
+#ifdef MULTIPROCESSOR
+       struct cpu_info *ci;
+       CPU_INFO_ITERATOR cii;
+       int s;
+#ifdef DIAGNOSTIC
+       int count = 0;
+#endif
+#endif
+
+       if (cpumask == 0)
+               return;
+
+       self = curcpu();
+#ifdef MULTIPROCESSOR
+       s = splipi();
+       self->ci_tlb_ipi_mask = cpumask;
+#endif
+
+       pmap_do_tlb_shootdown(self);    /* do *our* work. */
+
+#ifdef MULTIPROCESSOR
+       splx(s);
+
+       /*
+        * Send the TLB IPI to the other CPUs with pending shootdowns.
+        */
+       for (CPU_INFO_FOREACH(cii, ci)) {
+               if (ci == self)
+                       continue;
+               if (cpumask & (1U << ci->ci_cpuid))
+                       if (x86_send_ipi(ci, X86_IPI_TLB) != 0)
+                               x86_atomic_clearbits_l(&self->ci_tlb_ipi_mask,
+                                   (1U << ci->ci_cpuid));
+       }
+
+       while (self->ci_tlb_ipi_mask != 0) {
+#ifdef DIAGNOSTIC
+               if (count++ > 10000000)
+                       panic("TLB IPI rendezvous failed (mask %x)",
+                           self->ci_tlb_ipi_mask);
+#endif
+               x86_pause();
+       }
+#endif
+}
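
Callers batch shootdowns and pay for a single rendezvous: each pmap_tlb_shootdown() queues jobs and accumulates the target CPUs into a mask, and one pmap_tlb_shootnow() then sends the IPIs and spins until every CPU acknowledges. A caller-side sketch (kernel context and the prototypes above assumed):

    /* Sketch: batch several invalidations into one IPI round. */
    static void
    invalidate_one(pmap_t pmap, vaddr_t va, pt_entry_t opte)
    {
            int32_t cpumask = 0;

            pmap_tlb_shootdown(pmap, va, opte, &cpumask); /* queue + collect */
            /* ... more pmap_tlb_shootdown() calls may add to cpumask ... */
            pmap_tlb_shootnow(cpumask); /* one IPI round, spin until acked */
    }
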
+
+/*
+ * pmap_tlb_shootdown:
+ *
+ *     Cause the TLB entry for pmap/va to be shot down.
+ */
+void
+pmap_tlb_shootdown(pmap, va, pte, cpumaskp)
+       pmap_t pmap;
+       vaddr_t va;
+       pt_entry_t pte;
+       int32_t *cpumaskp;
+{
+       struct cpu_info *ci, *self;
+       struct pmap_tlb_shootdown_q *pq;
+       struct pmap_tlb_shootdown_job *pj;
+       CPU_INFO_ITERATOR cii;
+       int s;
+
+#ifdef LARGEPAGES
+       if (pte & PG_PS)
+               va &= PG_LGFRAME;
+#endif
+
+       if (pmap_initialized == FALSE || cpus_attached == 0) {
+               pmap_update_pg(va);
+               return;
+       }
+
+       self = curcpu();
+
+       s = splipi();
+#if 0
+       printf("dshootdown %lx\n", va);
+#endif
+
+       for (CPU_INFO_FOREACH(cii, ci)) {
+               /* Note: we queue shootdown events for ourselves here! */
+               if (pmap_is_active(pmap, ci->ci_cpuid) == 0)
+                       continue;
+               if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
+                       continue;
+               pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
+               __cpu_simple_lock(&pq->pq_slock);
+
+               /*
+                * If there's a global flush already queued, or a
+                * non-global flush, and this pte doesn't have the G
+                * bit set, don't bother.
+                */
+               if (pq->pq_flushg > 0 ||
+                   (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
+                       __cpu_simple_unlock(&pq->pq_slock);
+                       continue;
+               }
+
+#ifdef I386_CPU
+               /*
+                * i386 CPUs can't invalidate a single VA, only
+                * flush the entire TLB, so don't bother allocating
+                * jobs for them -- just queue a `flushu'.
+                *
+                * XXX note that this can be executed for non-i386
+                * CPUs when called early (before identifycpu() has
+                * set cpu_class).
+                */
+               if (cpu_class == CPUCLASS_386) {
+                       pq->pq_flushu++;
+                       *cpumaskp |= 1U << ci->ci_cpuid;
+                       __cpu_simple_unlock(&pq->pq_slock);
+                       continue;
+               }
+#endif
+
+               pj = pmap_tlb_shootdown_job_get(pq);
+               pq->pq_pte |= pte;
+               if (pj == NULL) {
+                       /*
+                        * Couldn't allocate a job entry.
+                        * Kill it now for this CPU, unless the failure
+                        * was due to too many pending flushes; otherwise,
+                        * tell the other CPUs to kill everything.
+                        */
+                       if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
+                               pmap_update_pg(va);
+                               __cpu_simple_unlock(&pq->pq_slock);
+                               continue;
+                       } else {
+                               if (pq->pq_pte & pmap_pg_g)
+                                       pq->pq_flushg++;
+                               else
+                                       pq->pq_flushu++;
+                               /*
+                                * Since we've nailed the whole thing,
+                                * drain the job entries pending for that
+                                * processor.
+                                */
+                               pmap_tlb_shootdown_q_drain(pq);
+                               *cpumaskp |= 1U << ci->ci_cpuid;
+                       }
+               } else {
+                       pj->pj_pmap = pmap;
+                       pj->pj_va = va;
+                       pj->pj_pte = pte;
+                       TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
+                       *cpumaskp |= 1U << ci->ci_cpuid;
+               }
+               __cpu_simple_unlock(&pq->pq_slock);
+       }
+       splx(s);
+}
+
+/*
+ * pmap_do_tlb_shootdown_checktlbstate: check and update ci_tlbstate.
+ *
+ * => called at splipi.
+ * => return TRUE if we need to maintain user tlbs.
+ */
+static __inline boolean_t
+pmap_do_tlb_shootdown_checktlbstate(struct cpu_info *ci)
+{
+
+       KASSERT(ci == curcpu());
+
+       if (ci->ci_tlbstate == TLBSTATE_LAZY) {
+               KASSERT(ci->ci_pmap != pmap_kernel());
+               /*
+                * mostly KASSERT(ci->ci_pmap->pm_cpus & (1U << ci->ci_cpuid));
+                */
+
+               /*
+                * we no longer want tlb shootdown ipis for this pmap.
+                * mark the pmap no longer in use by this processor.
+                */
+
+               x86_atomic_clearbits_l(&ci->ci_pmap->pm_cpus,
+                   1U << ci->ci_cpuid);
+               ci->ci_tlbstate = TLBSTATE_STALE;
+       }
+
+       if (ci->ci_tlbstate == TLBSTATE_STALE)
+               return FALSE;
+
+       return TRUE;
+}
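
The helper above is the lazy-TLB optimization: a CPU that kept a user pmap loaded while running a kernel thread can detach from the pmap instead of flushing, after which its stale TLB is simply reloaded on the next activation. A hypothetical C rendering of the three states consulted here (names from the surrounding code; values and comments are inferred):

    enum tlbstate {
            TLBSTATE_VALID, /* pmap is in active use; must honor shootdowns */
            TLBSTATE_LAZY,  /* kernel thread atop a user pmap; may detach */
            TLBSTATE_STALE  /* detached; full flush deferred to activation */
    };
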
+
+/*
+ * pmap_do_tlb_shootdown:
+ *
+ *     Process pending TLB shootdown operations for this processor.
+ */
+void
+pmap_do_tlb_shootdown(struct cpu_info *self)
+{
+       u_long cpu_id = self->ci_cpuid;
+       struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id];
+       struct pmap_tlb_shootdown_job *pj;
+       int s;
+#ifdef MULTIPROCESSOR
+       struct cpu_info *ci;
+       CPU_INFO_ITERATOR cii;
+#endif
+       KASSERT(self == curcpu());
+
+       s = splipi();
+
+       __cpu_simple_lock(&pq->pq_slock);
+
+       if (pq->pq_flushg) {
+               COUNT(flushg);
+               pmap_do_tlb_shootdown_checktlbstate(self);
+               tlbflushg();
+               pq->pq_flushg = 0;
+               pq->pq_flushu = 0;
+               pmap_tlb_shootdown_q_drain(pq);
+       } else {
+               /*
+                * TLB flushes for PTEs with PG_G set may be in the queue
+                * after a flushu, they need to be dealt with.
+                */
+               if (pq->pq_flushu) {
+                       COUNT(flushu);
+                       pmap_do_tlb_shootdown_checktlbstate(self);
+                       tlbflush();
+               }
+               while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
+                       TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
+
+                       if ((pj->pj_pte & pmap_pg_g) ||
+                           pj->pj_pmap == pmap_kernel()) {
+                               pmap_update_pg(pj->pj_va);
+                       } else if (!pq->pq_flushu &&
+                           pj->pj_pmap == self->ci_pmap) {
+                               if (pmap_do_tlb_shootdown_checktlbstate(self))
+                                       pmap_update_pg(pj->pj_va);
+                       }
+
+                       pmap_tlb_shootdown_job_put(pq, pj);
+               }
+
+               pq->pq_flushu = pq->pq_pte = 0;
+       }
+
+#ifdef MULTIPROCESSOR
+       for (CPU_INFO_FOREACH(cii, ci))
+               x86_atomic_clearbits_l(&ci->ci_tlb_ipi_mask,
+                   (1U << cpu_id));
+#endif
+       __cpu_simple_unlock(&pq->pq_slock);
+
+       splx(s);
+}
+
+
+/*
+ * pmap_tlb_shootdown_q_drain:
+ *
+ *     Drain a processor's TLB shootdown queue.  We do not perform
+ *     the shootdown operations.  This is merely a convenience
+ *     function.
+ *
+ *     Note: We expect the queue to be locked.
+ */
+void
+pmap_tlb_shootdown_q_drain(pq)
+       struct pmap_tlb_shootdown_q *pq;
+{
+       struct pmap_tlb_shootdown_job *pj;
+
+       while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
+               TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
+               pmap_tlb_shootdown_job_put(pq, pj);
+       }
+       pq->pq_pte = 0;
+}
+
+/*
+ * pmap_tlb_shootdown_job_get:
+ *
+ *     Get a TLB shootdown job queue entry.  This places a limit on
+ *     the number of outstanding jobs a processor may have.
+ *
+ *     Note: We expect the queue to be locked.
+ */
+struct pmap_tlb_shootdown_job *
+pmap_tlb_shootdown_job_get(pq)
+       struct pmap_tlb_shootdown_q *pq;
+{
+       struct pmap_tlb_shootdown_job *pj;
+
+       if (pq->pq_count >= PMAP_TLB_MAXJOBS)
+               return (NULL);
+
+       __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+       if (pj_free == NULL) {
+               __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+               return NULL;
+       }
+       pj = &pj_free->pja_job;
+       pj_free =
+           (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
+       __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+
+       pq->pq_count++;
+       return (pj);
+}
+
+/*
+ * pmap_tlb_shootdown_job_put:
+ *
+ *     Put a TLB shootdown job queue entry onto the free list.
+ *
+ *     Note: We expect the queue to be locked.
+ */
+void
+pmap_tlb_shootdown_job_put(pq, pj)
+       struct pmap_tlb_shootdown_q *pq;
+       struct pmap_tlb_shootdown_job *pj;
+{
+
+#ifdef DIAGNOSTIC
+       if (pq->pq_count == 0)
+               panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
+#endif
+       __cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
+       pj->pj_nextfree = &pj_free->pja_job;
+       pj_free = (union pmap_tlb_shootdown_job_al *)pj;
+       __cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
+
+       pq->pq_count--;
+}
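
The pool behind the two helpers above is a singly linked free list threaded through the jobs themselves, protected by one global spin lock, with a per-queue pq_count enforcing PMAP_TLB_MAXJOBS. A sketch of the assumed shapes (field names are taken from their uses above; the union pad width is a guess):

    struct pmap_tlb_shootdown_job {
            TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list; /* on a CPU's queue */
            pmap_t pj_pmap;
            vaddr_t pj_va;
            pt_entry_t pj_pte;
            struct pmap_tlb_shootdown_job *pj_nextfree;  /* free-list link */
    };

    union pmap_tlb_shootdown_job_al {
            struct pmap_tlb_shootdown_job pja_job;
            char pja_align[32]; /* hypothetical pad to a fixed entry size */
    };
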
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/sys_machdep.c
new file mode 100644 (file)
index 0000000..d65741f
--- /dev/null
@@ -0,0 +1,550 @@
+/*     $NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $   */
+/*     NetBSD: sys_machdep.c,v 1.70 2003/10/27 14:11:47 junyoung Exp   */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: sys_machdep.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $");
+
+#include "opt_compat_netbsd.h"
+#include "opt_mtrr.h"
+#include "opt_perfctrs.h"
+#include "opt_user_ldt.h"
+#include "opt_vm86.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl.h>
+#include <sys/file.h>
+#include <sys/time.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#include <sys/signal.h>
+#include <sys/malloc.h>
+
+#include <sys/mount.h>
+#include <sys/sa.h>
+#include <sys/syscallargs.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+#include <machine/gdt.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/sysarch.h>
+#include <machine/mtrr.h>
+
+#ifdef VM86
+#include <machine/vm86.h>
+#endif
+
+#ifdef PERFCTRS
+#include <machine/pmc.h>
+#endif
+
+extern struct vm_map *kernel_map;
+
+int i386_iopl(struct lwp *, void *, register_t *);
+int i386_get_ioperm(struct lwp *, void *, register_t *);
+int i386_set_ioperm(struct lwp *, void *, register_t *);
+int i386_get_mtrr(struct lwp *, void *, register_t *);
+int i386_set_mtrr(struct lwp *, void *, register_t *);
+
+#ifdef USER_LDT
+
+#ifdef LDT_DEBUG
+static void i386_print_ldt(int, const struct segment_descriptor *);
+
+static void
+i386_print_ldt(i, d)
+       int  i;
+       const struct segment_descriptor *d;
+{
+       printf("[%d] lolimit=0x%x, lobase=0x%x, type=%u, dpl=%u, p=%u, "
+           "hilimit=0x%x, xx=%x, def32=%u, gran=%u, hibase=0x%x\n",
+           i, d->sd_lolimit, d->sd_lobase, d->sd_type, d->sd_dpl, d->sd_p,
+           d->sd_hilimit, d->sd_xx, d->sd_def32, d->sd_gran, d->sd_hibase);
+}
+#endif
+
+int
+i386_get_ldt(l, args, retval)
+       struct lwp *l;
+       void *args;
+       register_t *retval;
+{
+       int error;
+       struct proc *p = l->l_proc;
+       pmap_t pmap = p->p_vmspace->vm_map.pmap;
+       int nldt, num;
+       union descriptor *lp, *cp;
+       struct i386_get_ldt_args ua;
+
+       if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+               return (error);
+
+#ifdef LDT_DEBUG
+       printf("i386_get_ldt: start=%d num=%d descs=%p\n", ua.start,
+           ua.num, ua.desc);
+#endif
+
+       if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 ||
+           ua.start + ua.num > 8192)
+               return (EINVAL);
+
+       cp = malloc(ua.num * sizeof(union descriptor), M_TEMP, M_WAITOK);
+       if (cp == NULL)
+               return ENOMEM;
+
+       simple_lock(&pmap->pm_lock);
+
+       if (pmap->pm_flags & PMF_USER_LDT) {
+               nldt = pmap->pm_ldt_len;
+               lp = pmap->pm_ldt;
+       } else {
+               nldt = NLDT;
+               lp = ldt;
+       }
+
+       if (ua.start > nldt) {
+               simple_unlock(&pmap->pm_lock);
+               free(cp, M_TEMP);
+               return (EINVAL);
+       }
+
+       lp += ua.start;
+       num = min(ua.num, nldt - ua.start);
+#ifdef LDT_DEBUG
+       {
+               int i;
+               for (i = 0; i < num; i++)
+                       i386_print_ldt(i, &lp[i].sd);
+       }
+#endif
+
+       memcpy(cp, lp, num * sizeof(union descriptor));
+       simple_unlock(&pmap->pm_lock);
+
+       error = copyout(cp, ua.desc, num * sizeof(union descriptor));
+       if (error == 0)
+               *retval = num;
+
+       free(cp, M_TEMP);
+       return (error);
+}
+
+int
+i386_set_ldt(l, args, retval)
+       struct lwp *l;
+       void *args;
+       register_t *retval;
+{
+       int error, i, n;
+       struct proc *p = l->l_proc;
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       pmap_t pmap = p->p_vmspace->vm_map.pmap;
+       struct i386_set_ldt_args ua;
+       union descriptor *descv;
+       size_t old_len, new_len, ldt_len;
+       union descriptor *old_ldt, *new_ldt;
+
+       if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+               return (error);
+
+       if (ua.start < 0 || ua.num < 0 || ua.start > 8192 || ua.num > 8192 ||
+           ua.start + ua.num > 8192)
+               return (EINVAL);
+
+       descv = malloc(sizeof (*descv) * ua.num, M_TEMP, M_NOWAIT);
+       if (descv == NULL)
+               return (ENOMEM);
+
+       if ((error = copyin(ua.desc, descv, sizeof (*descv) * ua.num)) != 0)
+               goto out;
+
+       /* Check descriptors for access violations. */
+       for (i = 0; i < ua.num; i++) {
+               union descriptor *desc = &descv[i];
+
+               switch (desc->sd.sd_type) {
+               case SDT_SYSNULL:
+                       desc->sd.sd_p = 0;
+                       break;
+               case SDT_SYS286CGT:
+               case SDT_SYS386CGT:
+                       /*
+                        * Only allow call gates targeting a segment
+                        * in the LDT or a user segment in the fixed
+                        * part of the gdt.  Segments in the LDT are
+                        * constrained (below) to be user segments.
+                        */
+                       if (desc->gd.gd_p != 0 &&
+                           !ISLDT(desc->gd.gd_selector) &&
+                           ((IDXSEL(desc->gd.gd_selector) >= NGDT) ||
+                            (gdt[IDXSEL(desc->gd.gd_selector)].sd.sd_dpl !=
+                                SEL_UPL))) {
+                               error = EACCES;
+                               goto out;
+                       }
+                       break;
+               case SDT_MEMEC:
+               case SDT_MEMEAC:
+               case SDT_MEMERC:
+               case SDT_MEMERAC:
+                       /* Must be "present" if executable and conforming. */
+                       if (desc->sd.sd_p == 0) {
+                               error = EACCES;
+                               goto out;
+                       }
+                       break;
+               case SDT_MEMRO:
+               case SDT_MEMROA:
+               case SDT_MEMRW:
+               case SDT_MEMRWA:
+               case SDT_MEMROD:
+               case SDT_MEMRODA:
+               case SDT_MEMRWD:
+               case SDT_MEMRWDA:
+               case SDT_MEME:
+               case SDT_MEMEA:
+               case SDT_MEMER:
+               case SDT_MEMERA:
+                       break;
+               default:
+                       /*
+                        * Make sure that unknown descriptor types are
+                        * not marked present.
+                        */
+                       if (desc->sd.sd_p != 0) {
+                               error = EACCES;
+                               goto out;
+                       }
+                       break;
+               }
+
+               if (desc->sd.sd_p != 0) {
+                       /* Only user (ring-3) descriptors may be present. */
+                       if (desc->sd.sd_dpl != SEL_UPL) {
+                               error = EACCES;
+                               goto out;
+                       }
+               }
+       }
+
+       /* allocate user ldt */
+       simple_lock(&pmap->pm_lock);
+       if (pmap->pm_ldt == 0 || (ua.start + ua.num) > pmap->pm_ldt_len) {
+               if (pmap->pm_flags & PMF_USER_LDT)
+                       ldt_len = pmap->pm_ldt_len;
+               else
+                       ldt_len = 512;
+               while ((ua.start + ua.num) > ldt_len)
+                       ldt_len *= 2;
+               new_len = ldt_len * sizeof(union descriptor);
+
+               simple_unlock(&pmap->pm_lock);
+               new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
+                   new_len);
+               simple_lock(&pmap->pm_lock);
+
+               if (pmap->pm_ldt != NULL && ldt_len <= pmap->pm_ldt_len) {
+                       /*
+                        * Another thread (re)allocated the LDT to
+                        * sufficient size while we were blocked in
+                        * uvm_km_alloc. Oh well. The new entries
+                        * will quite probably not be right, but
+                        * hey, not our problem if user applications
+                        * have race conditions like that.
+                        */
+                       uvm_km_free(kernel_map, (vaddr_t)new_ldt, new_len);
+                       goto copy;
+               }
+
+               old_ldt = pmap->pm_ldt;
+
+               if (old_ldt != NULL) {
+                       old_len = pmap->pm_ldt_len * sizeof(union descriptor);
+               } else {
+                       old_len = NLDT * sizeof(union descriptor);
+                       old_ldt = ldt;
+               }
+
+               memcpy(new_ldt, old_ldt, old_len);
+               memset((caddr_t)new_ldt + old_len, 0, new_len - old_len);
+
+               if (old_ldt != ldt)
+                       uvm_km_free(kernel_map, (vaddr_t)old_ldt, old_len);
+
+               pmap->pm_ldt = new_ldt;
+               pmap->pm_ldt_len = ldt_len;
+
+               if (pmap->pm_flags & PMF_USER_LDT)
+                       ldt_free(pmap);
+               else
+                       pmap->pm_flags |= PMF_USER_LDT;
+               ldt_alloc(pmap, new_ldt, new_len);
+               pcb->pcb_ldt_sel = pmap->pm_ldt_sel;
+               if (pcb == curpcb)
+                       lldt(pcb->pcb_ldt_sel);
+
+       }
+copy:
+       /* Now actually replace the descriptors. */
+       for (i = 0, n = ua.start; i < ua.num; i++, n++)
+               pmap->pm_ldt[n] = descv[i];
+
+       simple_unlock(&pmap->pm_lock);
+
+       *retval = ua.start;
+
+out:
+       free(descv, M_TEMP);
+       return (error);
+}
+#endif /* USER_LDT */
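
On NetBSD this path is normally reached through the libi386 i386_set_ldt(3) wrapper around sysarch(2). A hypothetical userland sketch, assuming that wrapper (error handling omitted):

    #include <sys/types.h>
    #include <machine/segments.h>
    #include <machine/sysarch.h>

    /* Install one descriptor into the LDT; funnels into sys_sysarch()
     * and the kernel i386_set_ldt() above. */
    int
    install_ldt_slot(int slot, union descriptor *d)
    {
            return i386_set_ldt(slot, d, 1); /* start, descriptors, count */
    }
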
+
+int
+i386_iopl(l, args, retval)
+       struct lwp *l;
+       void *args;
+       register_t *retval;
+{
+       int error;
+       struct proc *p = l->l_proc;
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       struct i386_iopl_args ua;
+       dom0_op_t op;
+
+       if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
+               return EPERM;
+
+       if (securelevel > 1)
+               return EPERM;
+
+       if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+               return error;
+
+       if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+               return error;
+
+       pcb->pcb_tss.tss_ioopt &= ~SEL_RPL;
+       if (ua.iopl)
+               pcb->pcb_tss.tss_ioopt |= SEL_UPL; /* i/o pl */
+       else
+               pcb->pcb_tss.tss_ioopt |= SEL_KPL; /* i/o pl */
+
+       /* Force the change at ring 0. */
+       op.cmd = DOM0_IOPL;
+       op.u.iopl.domain = DOMID_SELF;
+       op.u.iopl.iopl = pcb->pcb_tss.tss_ioopt & SEL_RPL; /* i/o pl */
+       HYPERVISOR_dom0_op(&op);
+
+       return 0;
+}
+
+int
+i386_get_ioperm(l, args, retval)
+       struct lwp *l;
+       void *args;
+       register_t *retval;
+{
+       int error;
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       struct i386_get_ioperm_args ua;
+
+       if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+               return (error);
+
+       return copyout(pcb->pcb_iomap, ua.iomap, sizeof(pcb->pcb_iomap));
+}
+
+int
+i386_set_ioperm(l, args, retval)
+       struct lwp *l;
+       void *args;
+       register_t *retval;
+{
+       int error;
+       struct proc *p = l->l_proc;
+       struct pcb *pcb = &l->l_addr->u_pcb;
+       struct i386_set_ioperm_args ua;
+
+       if (securelevel > 1)
+               return EPERM;
+
+       if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
+               return error;
+
+       if ((error = copyin(args, &ua, sizeof(ua))) != 0)
+               return (error);
+
+       return copyin(ua.iomap, pcb->pcb_iomap, sizeof(pcb->pcb_iomap));
+}
+
+#ifdef MTRR
+int
+i386_get_mtrr(struct lwp *l, void *args, register_t *retval)
+{
+       struct i386_get_mtrr_args ua;
+       int error, n;
+       struct proc *p = l->l_proc;
+
+       if (mtrr_funcs == NULL)
+               return ENOSYS;
+
+       error = copyin(args, &ua, sizeof ua);
+       if (error != 0)
+               return error;
+
+       error = copyin(ua.n, &n, sizeof n);
+       if (error != 0)
+               return error;
+
+       error = mtrr_get(ua.mtrrp, &n, p, MTRR_GETSET_USER);
+
+       copyout(&n, ua.n, sizeof (int));
+
+       return error;
+}
+
+int
+i386_set_mtrr(struct lwp *l, void *args, register_t *retval)
+{
+       int error, n;
+       struct i386_set_mtrr_args ua;
+       struct proc *p = l->l_proc;
+
+       if (mtrr_funcs == NULL)
+               return ENOSYS;
+
+       error = suser(p->p_ucred, &p->p_acflag);
+       if (error != 0)
+               return error;
+
+       error = copyin(args, &ua, sizeof ua);
+       if (error != 0)
+               return error;
+
+       error = copyin(ua.n, &n, sizeof n);
+       if (error != 0)
+               return error;
+
+       error = mtrr_set(ua.mtrrp, &n, p, MTRR_GETSET_USER);
+       if (n != 0)
+               mtrr_commit();
+
+       copyout(&n, ua.n, sizeof n);
+
+       return error;
+}
+#endif
+
+int
+sys_sysarch(struct lwp *l, void *v, register_t *retval)
+{
+       struct sys_sysarch_args /* {
+               syscallarg(int) op;
+               syscallarg(void *) parms;
+       } */ *uap = v;
+       int error = 0;
+
+       switch(SCARG(uap, op)) {
+#ifdef USER_LDT
+       case I386_GET_LDT: 
+               error = i386_get_ldt(l, SCARG(uap, parms), retval);
+               break;
+
+       case I386_SET_LDT: 
+               error = i386_set_ldt(l, SCARG(uap, parms), retval);
+               break;
+#endif
+
+       case I386_IOPL: 
+               error = i386_iopl(l, SCARG(uap, parms), retval);
+               break;
+
+       case I386_GET_IOPERM: 
+               error = i386_get_ioperm(l, SCARG(uap, parms), retval);
+               break;
+
+       case I386_SET_IOPERM: 
+               error = i386_set_ioperm(l, SCARG(uap, parms), retval);
+               break;
+
+#ifdef VM86
+       case I386_VM86:
+               error = i386_vm86(l, SCARG(uap, parms), retval);
+               break;
+#ifdef COMPAT_16
+       case I386_OLD_VM86:
+               error = compat_16_i386_vm86(l, SCARG(uap, parms), retval);
+               break;
+#endif
+#endif
+#ifdef MTRR
+       case I386_GET_MTRR:
+               error = i386_get_mtrr(l, SCARG(uap, parms), retval);
+               break;
+       case I386_SET_MTRR:
+               error = i386_set_mtrr(l, SCARG(uap, parms), retval);
+               break;
+#endif
+#ifdef PERFCTRS
+       case I386_PMC_INFO:
+               error = pmc_info(l, SCARG(uap, parms), retval);
+               break;
+
+       case I386_PMC_STARTSTOP:
+               error = pmc_startstop(l, SCARG(uap, parms), retval);
+               break;
+
+       case I386_PMC_READ:
+               error = pmc_read(l, SCARG(uap, parms), retval);
+               break;
+#endif
+
+       default:
+               error = EINVAL;
+               break;
+       }
+       return (error);
+}
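
sysarch(2) is the single userland entry point multiplexed above. A hypothetical sketch raising the I/O privilege level, which under Xen ends in the DOM0_IOPL hypercall issued by i386_iopl():

    #include <machine/sysarch.h>
    #include <err.h>

    int
    main(void)
    {
            struct i386_iopl_args ua = { .iopl = 1 };

            if (sysarch(I386_IOPL, &ua) == -1) /* EPERM unless privileged */
                    err(1, "sysarch(I386_IOPL)");
            return 0;
    }
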
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/vector.S
new file mode 100644 (file)
index 0000000..165b5f0
--- /dev/null
@@ -0,0 +1,1587 @@
+/*     $NetBSD: vector.S,v 1.1.2.1 2004/05/22 15:57:16 he Exp $        */
+/*     NetBSD: 1.13 2004/03/11 11:39:26 yamt Exp       */
+
+/*
+ * Copyright 2002 (c) Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Frank van der Linden for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_ddb.h"
+#include "opt_multiprocessor.h"
+#include "opt_ipkdb.h"
+#include "opt_vm86.h"
+#include "opt_xen.h"
+
+#ifndef XEN
+#include <machine/i8259.h>
+#endif
+#include <machine/i82093reg.h>
+#include <machine/i82489reg.h>
+#include <machine/asm.h>
+#include <machine/frameasm.h>
+#include <machine/segments.h>
+#include <machine/trap.h>
+#include <machine/intr.h>
+#include <machine/psl.h>
+#ifdef XEN
+#include <machine/xen.h>
+#endif
+
+#include <net/netisr.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+
+#include "npx.h"
+#include "assym.h"
+
+#define __HAVE_GENERIC_SOFT_INTERRUPTS /* XXX */
+
+
+/*
+ * Macros for interrupt entry, call to handler, and exit.
+ *
+ * XXX
+ * The interrupt frame is set up to look like a trap frame.  This may be a
+ * waste.  The only handler which needs a frame is the clock handler, and it
+ * only needs a few bits.  Xdoreti() needs a trap frame for handling ASTs, but
+ * it could easily convert the frame on demand.
+ *
+ * The direct costs of setting up a trap frame are two pushl's (error code and
+ * trap number), an addl to get rid of these, and pushing and popping the
+ * callee-saved registers %esi, %edi, %ebx, and %ebp twice.
+ *
+ * If the interrupt frame is made more flexible, INTR can push %eax first and
+ * decide the ipending case with less overhead, e.g., by avoiding loading the
+ * segment registers.
+ *
+ */
+
+#define MY_COUNT _C_LABEL(uvmexp)
+
+/* XXX See comment in locore.s */
+#ifdef __ELF__
+#define        XINTR(name,num)         Xintr_/**/name/**/num
+#define        XSTRAY(name,num)        Xstray_/**/name/**/num
+#define XINTR_TSS(irq_num)     Xintr_tss_ ## irq_num
+#else
+#define        XINTR(name,num)         _Xintr_/**/name/**/num
+#define        XSTRAY(name,num)        _Xstray_/**/name/**/num
+#define XINTR_TSS(irq_num)     Xintr_tss_/**/irq_num
+#endif
+
+/*
+ * Store address of TSS in %eax, given a selector in %eax.
+ * Clobbers %eax, %ecx, %edx, but that's ok for its usage.
+ * This is a bit complicated, but it's done to make as few
+ * assumptions as possible about the validity of the environment.
+ * The GDT and the current and previous TSS are known to be OK,
+ * otherwise we would not be here. The only other thing that needs
+ * to be OK is the cpu_info structure for the current CPU.
+ */
+#define GET_TSS \
+       andl    $0xfff8,%eax                            ;\
+       addl    CPUVAR(GDT),%eax                        ;\
+       movl    2(%eax),%edx                            ;\
+       andl    $0xffffff,%edx                          ;\
+       movzbl  7(%eax),%eax                            ;\
+       shl     $24,%eax                                ;\
+       orl     %edx,%eax
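
GET_TSS masks the selector down to a GDT byte offset, then splices the descriptor's scattered base field back together: bytes 2-4 give the low 24 bits and byte 7 the high 8. The same computation in C, sketched against the segment_descriptor layout used elsewhere in this tree:

    /* Reassemble a descriptor's 32-bit base, as GET_TSS does in asm. */
    uint32_t
    descriptor_base(const struct segment_descriptor *sd)
    {
            return (uint32_t)sd->sd_lobase | ((uint32_t)sd->sd_hibase << 24);
    }
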
+
+#if NLAPIC > 0
+#ifdef MULTIPROCESSOR
+IDTVEC(recurse_lapic_ipi)
+       pushfl
+       pushl   %cs
+       pushl   %esi
+       pushl   $0
+       pushl   $T_ASTFLT
+       INTRENTRY
+IDTVEC(resume_lapic_ipi)
+       cli
+       jmp     1f
+IDTVEC(intr_lapic_ipi)
+       pushl   $0
+       pushl   $T_ASTFLT
+       INTRENTRY
+       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
+       movl    CPUVAR(ILEVEL),%ebx
+       cmpl    $IPL_IPI,%ebx
+       jae     2f
+1:
+       incl    CPUVAR(IDEPTH)
+       movl    $IPL_IPI,CPUVAR(ILEVEL)
+        sti
+       pushl   %ebx
+       call    _C_LABEL(x86_ipi_handler)
+       jmp     _C_LABEL(Xdoreti)
+2:
+       orl     $(1 << LIR_IPI),CPUVAR(IPENDING)
+       sti
+       INTRFASTEXIT
+
+#if defined(DDB)
+IDTVEC(intrddbipi)
+1:
+       str     %ax
+       GET_TSS
+       movzwl  (%eax),%eax
+       GET_TSS
+       pushl   %eax
+       movl    $0xff,_C_LABEL(lapic_tpr)
+       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
+       sti
+       call    _C_LABEL(ddb_ipi_tss)
+       addl    $4,%esp
+       movl    $0,_C_LABEL(lapic_tpr)
+       iret
+       jmp     1b
+#endif /* DDB */
+#endif /* MULTIPROCESSOR */
+
+       /*
+        * Interrupt from the local APIC timer.
+        */
+IDTVEC(recurse_lapic_ltimer)
+       pushfl
+       pushl   %cs
+       pushl   %esi
+       pushl   $0
+       pushl   $T_ASTFLT
+       INTRENTRY
+IDTVEC(resume_lapic_ltimer)
+       cli
+       jmp     1f
+IDTVEC(intr_lapic_ltimer)
+       pushl   $0
+       pushl   $T_ASTFLT
+       INTRENTRY
+       movl    $0,_C_LABEL(local_apic)+LAPIC_EOI
+       movl    CPUVAR(ILEVEL),%ebx
+       cmpl    $IPL_CLOCK,%ebx
+       jae     2f
+1:
+       incl    CPUVAR(IDEPTH)
+       movl    $IPL_CLOCK,CPUVAR(ILEVEL)
+       sti
+       pushl   %ebx
+       pushl   $0
+       call    _C_LABEL(lapic_clockintr)
+       addl    $4,%esp
+       jmp     _C_LABEL(Xdoreti)
+2:
+       orl     $(1 << LIR_TIMER),CPUVAR(IPENDING)
+       sti
+       INTRFASTEXIT
+#endif /* NLAPIC > 0 */
+
+#ifdef MULTIPROCESSOR
+#define LOCK_KERNEL    pushl %esp ; call _C_LABEL(x86_intlock) ; addl $4,%esp
+#define UNLOCK_KERNEL  pushl %esp ; call _C_LABEL(x86_intunlock) ; addl $4,%esp
+#else
+#define LOCK_KERNEL
+#define UNLOCK_KERNEL
+#endif
+
+#define voidop(num)
+
+
+#define        XENINTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
+IDTVEC(recurse_/**/name/**/num)                                                ;\
+       pushfl                                                          ;\
+       pushl   %cs                                                     ;\
+       pushl   %esi                                                    ;\
+       subl    $4,%esp                                                 ;\
+       pushl   $T_ASTFLT               /* trap # for doing ASTs */     ;\
+       INTRENTRY                                                       ;\
+IDTVEC(resume_/**/name/**/num)                                         \
+       /*movl  %esp,%ecx*/                                             ;\
+       movl    $IREENT_MAGIC,TF_ERR(%esp)                              ;\
+       movl    %ebx,%esi                                               ;\
+       movl    CPUVAR(ISOURCES) + (num) * 4, %ebp                      ;\
+       movl    IS_MAXLEVEL(%ebp),%ebx                                  ;\
+       jmp     1f                                                      ;\
+IDTVEC(intr_/**/name/**/num)                                           ;\
+       pushl   $0                      /* dummy error code */          ;\
+       pushl   $T_ASTFLT               /* trap # for doing ASTs */     ;\
+       INTRENTRY                                                       ;\
+       /*movl  %esp,%ecx*/                                             ;\
+       movl    CPUVAR(ISOURCES) + (num) * 4, %ebp              ;\
+       mask(num)               /* mask it in hardware */       ;\
+       early_ack(num)                  /* and allow other intrs */     ;\
+       testl   %ebp,%ebp                                               ;\
+       jz      9f                      /* stray */                     ;\
+       movl    IS_MAXLEVEL(%ebp),%ebx                                  ;\
+       movl    CPUVAR(ILEVEL),%esi                                     ;\
+       cmpl    %ebx,%esi                                               ;\
+       jae     10f                     /* currently masked; hold it */ ;\
+       incl    MY_COUNT+V_INTR         /* statistical info */          ;\
+       addl    $1,IS_EVCNTLO(%ebp)     /* inc event counter */         ;\
+       adcl    $0,IS_EVCNTHI(%ebp)                                     ;\
+1:                                                                     \
+       pushl   %esi                                                    ;\
+       movl    %ebx,CPUVAR(ILEVEL)                                     ;\
+       STI(%eax)                                                       ;\
+       incl    CPUVAR(IDEPTH)                                          ;\
+       movl    IS_HANDLERS(%ebp),%ebx                                  ;\
+       LOCK_KERNEL                                                     ;\
+6:                                                                     \
+       movl    IH_LEVEL(%ebx),%edi                                     ;\
+       cmpl    %esi,%edi                                               ;\
+       jle     7f                                                      ;\
+       pushl   %esp                                                    ;\
+       pushl   IH_ARG(%ebx)                                            ;\
+       movl    %edi,CPUVAR(ILEVEL)                                     ;\
+       call    *IH_FUN(%ebx)           /* call it */                   ;\
+       addl    $8,%esp                 /* toss the arg */              ;\
+       movl    IH_NEXT(%ebx),%ebx      /* next handler in chain */     ;\
+       testl   %ebx,%ebx                                               ;\
+       jnz     6b                                                      ;\
+5:                                                                     \
+       UNLOCK_KERNEL                                                   ;\
+       CLI(%eax)                                                       ;\
+       unmask(num)                     /* unmask it in hardware */     ;\
+       late_ack(num)                                                   ;\
+       STI(%eax)                                                       ;\
+       jmp     _C_LABEL(Xdoreti)       /* lower spl and do ASTs */     ;\
+7:                                                                     \
+       UNLOCK_KERNEL                                                   ;\
+       CLI(%eax)                                                       ;\
+       orl     $(1 << num),CPUVAR(IPENDING)                            ;\
+       level_mask(num)                                                 ;\
+       late_ack(num)                                                   ;\
+       STI(%eax)                                                       ;\
+       jmp     _C_LABEL(Xdoreti)       /* lower spl and do ASTs */     ;\
+10:                                                                    \
+       CLI(%eax)                                                       ;\
+       orl     $(1 << num),CPUVAR(IPENDING)                            ;\
+       level_mask(num)                                                 ;\
+6:                             ; \
+       late_ack(num)                                                   ;\
+       STIC(%eax)                                                      ;\
+       jz      4f              ; \
+       call    _C_LABEL(stipending) ; \
+       testl   %eax,%eax       ; \
+       jnz     1b              ; \
+4:     INTRFASTEXIT                                                    ;\
+9:                                                                     \
+       unmask(num)                                                     ;\
+       jmp     6b
+
+#define hypervisor_asm_unmask(num)                     \
+       movl    irq_to_evtchn + (num) * 4,%ecx          ;\
+       movl    HYPERVISOR_shared_info,%eax             ;\
+       lock                                            ;\
+       btrl    %ecx,EVENTS_MASK(%eax)
+
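
hypervisor_asm_unmask re-enables an event channel by atomically clearing its bit in the shared-info mask with a locked btr; bit-string addressing lets the channel number index past the first word. An equivalent C helper, sketched with GCC inline assembly (the mask layout is assumed):

    /* Sketch: atomically clear event channel `port' in the mask array,
     * mirroring the lock btrl in the macro above. */
    static inline void
    xen_evtchn_unmask(volatile unsigned long *mask_words, int port)
    {
            __asm volatile("lock; btrl %1, %0"
                : "+m" (*mask_words)
                : "Ir" (port)
                : "memory");
    }
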
+XENINTRSTUB(xenev,0,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,1,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,2,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,3,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,4,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,5,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,6,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,7,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,8,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,9,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,10,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,11,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,12,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,13,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,14,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,15,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,16,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,17,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,18,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,19,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,20,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,21,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,22,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,23,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,24,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,25,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,26,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,27,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,28,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,29,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,30,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+XENINTRSTUB(xenev,31,voidop,voidop,voidop,hypervisor_asm_unmask,voidop)
+
+.globl _C_LABEL(xenev_stubs)
+_C_LABEL(xenev_stubs):
+       .long _C_LABEL(Xintr_xenev0), _C_LABEL(Xrecurse_xenev0)
+       .long _C_LABEL(Xresume_xenev0)
+       .long _C_LABEL(Xintr_xenev1), _C_LABEL(Xrecurse_xenev1)
+       .long _C_LABEL(Xresume_xenev1)
+       .long _C_LABEL(Xintr_xenev2), _C_LABEL(Xrecurse_xenev2)
+       .long _C_LABEL(Xresume_xenev2)
+       .long _C_LABEL(Xintr_xenev3), _C_LABEL(Xrecurse_xenev3)
+       .long _C_LABEL(Xresume_xenev3)
+       .long _C_LABEL(Xintr_xenev4), _C_LABEL(Xrecurse_xenev4)
+       .long _C_LABEL(Xresume_xenev4)
+       .long _C_LABEL(Xintr_xenev5), _C_LABEL(Xrecurse_xenev5)
+       .long _C_LABEL(Xresume_xenev5)
+       .long _C_LABEL(Xintr_xenev6), _C_LABEL(Xrecurse_xenev6)
+       .long _C_LABEL(Xresume_xenev6)
+       .long _C_LABEL(Xintr_xenev7), _C_LABEL(Xrecurse_xenev7)
+       .long _C_LABEL(Xresume_xenev7)
+       .long _C_LABEL(Xintr_xenev8), _C_LABEL(Xrecurse_xenev8)
+       .long _C_LABEL(Xresume_xenev8)
+       .long _C_LABEL(Xintr_xenev9), _C_LABEL(Xrecurse_xenev9)
+       .long _C_LABEL(Xresume_xenev9)
+       .long _C_LABEL(Xintr_xenev10), _C_LABEL(Xrecurse_xenev10)
+       .long _C_LABEL(Xresume_xenev10)
+       .long _C_LABEL(Xintr_xenev11), _C_LABEL(Xrecurse_xenev11)
+       .long _C_LABEL(Xresume_xenev11)
+       .long _C_LABEL(Xintr_xenev12), _C_LABEL(Xrecurse_xenev12)
+       .long _C_LABEL(Xresume_xenev12)
+       .long _C_LABEL(Xintr_xenev13), _C_LABEL(Xrecurse_xenev13)
+       .long _C_LABEL(Xresume_xenev13)
+       .long _C_LABEL(Xintr_xenev14), _C_LABEL(Xrecurse_xenev14)
+       .long _C_LABEL(Xresume_xenev14)
+       .long _C_LABEL(Xintr_xenev15), _C_LABEL(Xrecurse_xenev15)
+       .long _C_LABEL(Xresume_xenev15)
+       .long _C_LABEL(Xintr_xenev16), _C_LABEL(Xrecurse_xenev16)
+       .long _C_LABEL(Xresume_xenev16)
+       .long _C_LABEL(Xintr_xenev17), _C_LABEL(Xrecurse_xenev17)
+       .long _C_LABEL(Xresume_xenev17)
+       .long _C_LABEL(Xintr_xenev18), _C_LABEL(Xrecurse_xenev18)
+       .long _C_LABEL(Xresume_xenev18)
+       .long _C_LABEL(Xintr_xenev19), _C_LABEL(Xrecurse_xenev19)
+       .long _C_LABEL(Xresume_xenev19)
+       .long _C_LABEL(Xintr_xenev20), _C_LABEL(Xrecurse_xenev20)
+       .long _C_LABEL(Xresume_xenev20)
+       .long _C_LABEL(Xintr_xenev21), _C_LABEL(Xrecurse_xenev21)
+       .long _C_LABEL(Xresume_xenev21)
+       .long _C_LABEL(Xintr_xenev22), _C_LABEL(Xrecurse_xenev22)
+       .long _C_LABEL(Xresume_xenev22)
+       .long _C_LABEL(Xintr_xenev23), _C_LABEL(Xrecurse_xenev23)
+       .long _C_LABEL(Xresume_xenev23)
+       .long _C_LABEL(Xintr_xenev24), _C_LABEL(Xrecurse_xenev24)
+       .long _C_LABEL(Xresume_xenev24)
+       .long _C_LABEL(Xintr_xenev25), _C_LABEL(Xrecurse_xenev25)
+       .long _C_LABEL(Xresume_xenev25)
+       .long _C_LABEL(Xintr_xenev26), _C_LABEL(Xrecurse_xenev26)
+       .long _C_LABEL(Xresume_xenev26)
+       .long _C_LABEL(Xintr_xenev27), _C_LABEL(Xrecurse_xenev27)
+       .long _C_LABEL(Xresume_xenev27)
+       .long _C_LABEL(Xintr_xenev28), _C_LABEL(Xrecurse_xenev28)
+       .long _C_LABEL(Xresume_xenev28)
+       .long _C_LABEL(Xintr_xenev29), _C_LABEL(Xrecurse_xenev29)
+       .long _C_LABEL(Xresume_xenev29)
+       .long _C_LABEL(Xintr_xenev30), _C_LABEL(Xrecurse_xenev30)
+       .long _C_LABEL(Xresume_xenev30)
+       .long _C_LABEL(Xintr_xenev31), _C_LABEL(Xrecurse_xenev31)
+       .long _C_LABEL(Xresume_xenev31)
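
The table above lays out three code pointers per event channel, indexed by channel number, for the interrupt-setup code to patch into the IDT and the recursion paths. A hypothetical C-side view (the struct and member names follow NetBSD's intrstub convention and are assumptions here):

    struct intrstub {
            void *ist_entry;   /* Xintr_xenevN: entry from the hypervisor */
            void *ist_recurse; /* Xrecurse_xenevN: replayed from splx() */
            void *ist_resume;  /* Xresume_xenevN: resume a deferred intr */
    };
    extern struct intrstub xenev_stubs[32];
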
+
+#ifndef XEN
+/*
+ * This macro defines the generic stub code. Its arguments modify it
+ * for specific PICs.
+ */
+
+#define        INTRSTUB(name, num, early_ack, late_ack, mask, unmask, level_mask) \
+IDTVEC(recurse_/**/name/**/num)                                                ;\
+       pushfl                                                          ;\
+       pushl   %cs                                                     ;\
+       pushl   %esi                                                    ;\
+       subl    $4,%esp                                                 ;\
+       pushl   $T_ASTFLT               /* trap # for doing ASTs */     ;\
+       INTRENTRY                                                       ;\
+IDTVEC(resume_/**/name/**/num)                                         \
+       movl    $IREENT_MAGIC,TF_ERR(%esp)                              ;\
+       movl    %ebx,%esi                                               ;\
+       movl    CPUVAR(ISOURCES) + (num) * 4, %ebp                      ;\
+       movl    IS_MAXLEVEL(%ebp),%ebx                                  ;\
+       jmp     1f                                                      ;\
+IDTVEC(intr_/**/name/**/num)                                           ;\
+       pushl   $0                      /* dummy error code */          ;\
+       pushl   $T_ASTFLT               /* trap # for doing ASTs */     ;\
+       INTRENTRY                                                       ;\
+       movl    CPUVAR(ISOURCES) + (num) * 4, %ebp              ;\
+       mask(num)               /* mask it in hardware */       ;\
+       early_ack(num)                  /* and allow other intrs */     ;\
+       testl   %ebp,%ebp                                               ;\
+       jz      9f                      /* stray */                     ;\
+       movl    IS_MAXLEVEL(%ebp),%ebx                                  ;\
+       movl    CPUVAR(ILEVEL),%esi                                     ;\
+       cmpl    %ebx,%esi                                               ;\
+       jae     10f                     /* currently masked; hold it */ ;\
+       incl    MY_COUNT+V_INTR         /* statistical info */          ;\
+       addl    $1,IS_EVCNTLO(%ebp)     /* inc event counter */         ;\
+       adcl    $0,IS_EVCNTHI(%ebp)                                     ;\
+1:                                                                     \
+       pushl   %esi                                                    ;\
+       movl    %ebx,CPUVAR(ILEVEL)                                     ;\
+       STI(%eax)                                                       ;\
+       incl    CPUVAR(IDEPTH)                                          ;\
+       movl    IS_HANDLERS(%ebp),%ebx                                  ;\
+       LOCK_KERNEL                                                     ;\
+6:                                                                     \
+       movl    IH_LEVEL(%ebx),%edi                                     ;\
+       cmpl    %esi,%edi                                               ;\
+       jle     7f                                                      ;\
+       pushl   IH_ARG(%ebx)                                            ;\
+       movl    %edi,CPUVAR(ILEVEL)                                     ;\
+       call    *IH_FUN(%ebx)           /* call it */                   ;\
+       addl    $4,%esp                 /* toss the arg */              ;\
+       movl    IH_NEXT(%ebx),%ebx      /* next handler in chain */     ;\
+       testl   %ebx,%ebx                                               ;\
+       jnz     6b                                                      ;\
+5:                                                                     \
+       UNLOCK_KERNEL                                                   ;\
+       CLI(%eax)                                                       ;\
+       unmask(num)                     /* unmask it in hardware */     ;\
+       late_ack(num)                                                   ;\
+       STI(%eax)                                                       ;\
+       jmp     _C_LABEL(Xdoreti)       /* lower spl and do ASTs */     ;\
+7:                                                                     \
+       UNLOCK_KERNEL                                                   ;\
+       CLI(%eax)                                                       ;\
+       orl     $(1 << num),CPUVAR(IPENDING)                            ;\
+       level_mask(num)                                                 ;\
+       late_ack(num)                                                   ;\
+       STI(%eax)                                                       ;\
+       jmp     _C_LABEL(Xdoreti)       /* lower spl and do ASTs */     ;\
+10:                                                                    \
+       CLI(%eax)                                                       ;\
+       orl     $(1 << num),CPUVAR(IPENDING)                            ;\
+       level_mask(num)                                                 ;\
+       late_ack(num)                                                   ;\
+       STIC(%eax)                                                      ;\
+       jz      4f              ; \
+       call    _C_LABEL(stipending) ; \
+       testl   %eax,%eax       ; \
+       jnz     1b              ; \
+4:     INTRFASTEXIT                                                    ;\
+9:                                                                     \
+       unmask(num)                                                     ;\
+       late_ack(num)                                                   ;\
+       STIC(%eax)                                                      ;\
+       jz      4f              ; \
+       call    _C_LABEL(stipending) ; \
+       testl   %eax,%eax       ; \
+       jnz     1b              ; \
+4:     INTRFASTEXIT
+
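+/*
+ * The i8259 helper macros used below are assumed to expand using
+ * ICUADDR, so it is pointed at the master ICU for IRQs 0-7 and
+ * redefined to the slave ICU for IRQs 8-15.
+ */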
+#define ICUADDR IO_ICU1
+
+INTRSTUB(legacy,0,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,1,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,2,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,3,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,4,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,5,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,6,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,7,i8259_asm_ack1,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+#undef ICUADDR
+#define ICUADDR IO_ICU2
+
+INTRSTUB(legacy,8,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,9,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,10,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,11,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,12,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,13,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,14,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+INTRSTUB(legacy,15,i8259_asm_ack2,voidop,i8259_asm_mask,i8259_asm_unmask,
+    voidop)
+#endif
+
+#if NIOAPIC > 0
+
+INTRSTUB(ioapic_edge,0,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,1,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,2,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,3,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,4,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,5,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,6,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,7,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,8,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,9,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,10,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,11,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,12,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,13,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,14,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,15,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,16,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,17,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,18,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,19,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,20,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,21,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,22,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,23,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,24,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,25,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,26,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,27,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,28,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,29,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,30,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+INTRSTUB(ioapic_edge,31,voidop,ioapic_asm_ack,voidop,voidop,voidop)
+
+INTRSTUB(ioapic_level,0,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,1,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,2,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,3,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,4,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,5,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,6,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,7,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,8,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,9,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,10,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,11,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,12,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,13,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,14,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,15,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,16,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,17,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,18,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,19,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,20,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,21,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,22,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,23,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,24,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,25,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,26,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,27,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,28,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,29,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,30,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+INTRSTUB(ioapic_level,31,voidop,ioapic_asm_ack,voidop,ioapic_unmask,ioapic_mask)
+
+#endif
+
+#ifndef XEN
+.globl _C_LABEL(i8259_stubs)
+_C_LABEL(i8259_stubs):
+       .long _C_LABEL(Xintr_legacy0), _C_LABEL(Xrecurse_legacy0)
+       .long _C_LABEL(Xresume_legacy0)
+       .long _C_LABEL(Xintr_legacy1), _C_LABEL(Xrecurse_legacy1)
+       .long _C_LABEL(Xresume_legacy1)
+       .long _C_LABEL(Xintr_legacy2), _C_LABEL(Xrecurse_legacy2)
+       .long _C_LABEL(Xresume_legacy2)
+       .long _C_LABEL(Xintr_legacy3), _C_LABEL(Xrecurse_legacy3)
+       .long _C_LABEL(Xresume_legacy3)
+       .long _C_LABEL(Xintr_legacy4), _C_LABEL(Xrecurse_legacy4)
+       .long _C_LABEL(Xresume_legacy4)
+       .long _C_LABEL(Xintr_legacy5), _C_LABEL(Xrecurse_legacy5)
+       .long _C_LABEL(Xresume_legacy5)
+       .long _C_LABEL(Xintr_legacy6), _C_LABEL(Xrecurse_legacy6)
+       .long _C_LABEL(Xresume_legacy6)
+       .long _C_LABEL(Xintr_legacy7), _C_LABEL(Xrecurse_legacy7)
+       .long _C_LABEL(Xresume_legacy7)
+       .long _C_LABEL(Xintr_legacy8), _C_LABEL(Xrecurse_legacy8)
+       .long _C_LABEL(Xresume_legacy8)
+       .long _C_LABEL(Xintr_legacy9), _C_LABEL(Xrecurse_legacy9)
+       .long _C_LABEL(Xresume_legacy9)
+       .long _C_LABEL(Xintr_legacy10), _C_LABEL(Xrecurse_legacy10)
+       .long _C_LABEL(Xresume_legacy10)
+       .long _C_LABEL(Xintr_legacy11), _C_LABEL(Xrecurse_legacy11)
+       .long _C_LABEL(Xresume_legacy11)
+       .long _C_LABEL(Xintr_legacy12), _C_LABEL(Xrecurse_legacy12)
+       .long _C_LABEL(Xresume_legacy12)
+       .long _C_LABEL(Xintr_legacy13), _C_LABEL(Xrecurse_legacy13)
+       .long _C_LABEL(Xresume_legacy13)
+       .long _C_LABEL(Xintr_legacy14), _C_LABEL(Xrecurse_legacy14)
+       .long _C_LABEL(Xresume_legacy14)
+       .long _C_LABEL(Xintr_legacy15), _C_LABEL(Xrecurse_legacy15)
+       .long _C_LABEL(Xresume_legacy15)
+#endif
+
+#if NIOAPIC > 0
+.globl _C_LABEL(ioapic_edge_stubs)
+_C_LABEL(ioapic_edge_stubs):
+       .long _C_LABEL(Xintr_ioapic_edge0), _C_LABEL(Xrecurse_ioapic_edge0)
+       .long _C_LABEL(Xresume_ioapic_edge0)
+       .long _C_LABEL(Xintr_ioapic_edge1), _C_LABEL(Xrecurse_ioapic_edge1)
+       .long _C_LABEL(Xresume_ioapic_edge1)
+       .long _C_LABEL(Xintr_ioapic_edge2), _C_LABEL(Xrecurse_ioapic_edge2)
+       .long _C_LABEL(Xresume_ioapic_edge2)
+       .long _C_LABEL(Xintr_ioapic_edge3), _C_LABEL(Xrecurse_ioapic_edge3)
+       .long _C_LABEL(Xresume_ioapic_edge3)
+       .long _C_LABEL(Xintr_ioapic_edge4), _C_LABEL(Xrecurse_ioapic_edge4)
+       .long _C_LABEL(Xresume_ioapic_edge4)
+       .long _C_LABEL(Xintr_ioapic_edge5), _C_LABEL(Xrecurse_ioapic_edge5)
+       .long _C_LABEL(Xresume_ioapic_edge5)
+       .long _C_LABEL(Xintr_ioapic_edge6), _C_LABEL(Xrecurse_ioapic_edge6)
+       .long _C_LABEL(Xresume_ioapic_edge6)
+       .long _C_LABEL(Xintr_ioapic_edge7), _C_LABEL(Xrecurse_ioapic_edge7)
+       .long _C_LABEL(Xresume_ioapic_edge7)
+       .long _C_LABEL(Xintr_ioapic_edge8), _C_LABEL(Xrecurse_ioapic_edge8)
+       .long _C_LABEL(Xresume_ioapic_edge8)
+       .long _C_LABEL(Xintr_ioapic_edge9), _C_LABEL(Xrecurse_ioapic_edge9)
+       .long _C_LABEL(Xresume_ioapic_edge9)
+       .long _C_LABEL(Xintr_ioapic_edge10), _C_LABEL(Xrecurse_ioapic_edge10)
+       .long _C_LABEL(Xresume_ioapic_edge10)
+       .long _C_LABEL(Xintr_ioapic_edge11), _C_LABEL(Xrecurse_ioapic_edge11)
+       .long _C_LABEL(Xresume_ioapic_edge11)
+       .long _C_LABEL(Xintr_ioapic_edge12), _C_LABEL(Xrecurse_ioapic_edge12)
+       .long _C_LABEL(Xresume_ioapic_edge12)
+       .long _C_LABEL(Xintr_ioapic_edge13), _C_LABEL(Xrecurse_ioapic_edge13)
+       .long _C_LABEL(Xresume_ioapic_edge13)
+       .long _C_LABEL(Xintr_ioapic_edge14), _C_LABEL(Xrecurse_ioapic_edge14)
+       .long _C_LABEL(Xresume_ioapic_edge14)
+       .long _C_LABEL(Xintr_ioapic_edge15), _C_LABEL(Xrecurse_ioapic_edge15)
+       .long _C_LABEL(Xresume_ioapic_edge15)
+       .long _C_LABEL(Xintr_ioapic_edge16), _C_LABEL(Xrecurse_ioapic_edge16)
+       .long _C_LABEL(Xresume_ioapic_edge16)
+       .long _C_LABEL(Xintr_ioapic_edge17), _C_LABEL(Xrecurse_ioapic_edge17)
+       .long _C_LABEL(Xresume_ioapic_edge17)
+       .long _C_LABEL(Xintr_ioapic_edge18), _C_LABEL(Xrecurse_ioapic_edge18)
+       .long _C_LABEL(Xresume_ioapic_edge18)
+       .long _C_LABEL(Xintr_ioapic_edge19), _C_LABEL(Xrecurse_ioapic_edge19)
+       .long _C_LABEL(Xresume_ioapic_edge19)
+       .long _C_LABEL(Xintr_ioapic_edge20), _C_LABEL(Xrecurse_ioapic_edge20)
+       .long _C_LABEL(Xresume_ioapic_edge20)
+       .long _C_LABEL(Xintr_ioapic_edge21), _C_LABEL(Xrecurse_ioapic_edge21)
+       .long _C_LABEL(Xresume_ioapic_edge21)
+       .long _C_LABEL(Xintr_ioapic_edge22), _C_LABEL(Xrecurse_ioapic_edge22)
+       .long _C_LABEL(Xresume_ioapic_edge22)
+       .long _C_LABEL(Xintr_ioapic_edge23), _C_LABEL(Xrecurse_ioapic_edge23)
+       .long _C_LABEL(Xresume_ioapic_edge23)
+       .long _C_LABEL(Xintr_ioapic_edge24), _C_LABEL(Xrecurse_ioapic_edge24)
+       .long _C_LABEL(Xresume_ioapic_edge24)
+       .long _C_LABEL(Xintr_ioapic_edge25), _C_LABEL(Xrecurse_ioapic_edge25)
+       .long _C_LABEL(Xresume_ioapic_edge25)
+       .long _C_LABEL(Xintr_ioapic_edge26), _C_LABEL(Xrecurse_ioapic_edge26)
+       .long _C_LABEL(Xresume_ioapic_edge26)
+       .long _C_LABEL(Xintr_ioapic_edge27), _C_LABEL(Xrecurse_ioapic_edge27)
+       .long _C_LABEL(Xresume_ioapic_edge27)
+       .long _C_LABEL(Xintr_ioapic_edge28), _C_LABEL(Xrecurse_ioapic_edge28)
+       .long _C_LABEL(Xresume_ioapic_edge28)
+       .long _C_LABEL(Xintr_ioapic_edge29), _C_LABEL(Xrecurse_ioapic_edge29)
+       .long _C_LABEL(Xresume_ioapic_edge29)
+       .long _C_LABEL(Xintr_ioapic_edge30), _C_LABEL(Xrecurse_ioapic_edge30)
+       .long _C_LABEL(Xresume_ioapic_edge30)
+       .long _C_LABEL(Xintr_ioapic_edge31), _C_LABEL(Xrecurse_ioapic_edge31)
+       .long _C_LABEL(Xresume_ioapic_edge31)
+
+.globl _C_LABEL(ioapic_level_stubs)
+_C_LABEL(ioapic_level_stubs):
+       .long _C_LABEL(Xintr_ioapic_level0), _C_LABEL(Xrecurse_ioapic_level0)
+       .long _C_LABEL(Xresume_ioapic_level0)
+       .long _C_LABEL(Xintr_ioapic_level1), _C_LABEL(Xrecurse_ioapic_level1)
+       .long _C_LABEL(Xresume_ioapic_level1)
+       .long _C_LABEL(Xintr_ioapic_level2), _C_LABEL(Xrecurse_ioapic_level2)
+       .long _C_LABEL(Xresume_ioapic_level2)
+       .long _C_LABEL(Xintr_ioapic_level3), _C_LABEL(Xrecurse_ioapic_level3)
+       .long _C_LABEL(Xresume_ioapic_level3)
+       .long _C_LABEL(Xintr_ioapic_level4), _C_LABEL(Xrecurse_ioapic_level4)
+       .long _C_LABEL(Xresume_ioapic_level4)
+       .long _C_LABEL(Xintr_ioapic_level5), _C_LABEL(Xrecurse_ioapic_level5)
+       .long _C_LABEL(Xresume_ioapic_level5)
+       .long _C_LABEL(Xintr_ioapic_level6), _C_LABEL(Xrecurse_ioapic_level6)
+       .long _C_LABEL(Xresume_ioapic_level6)
+       .long _C_LABEL(Xintr_ioapic_level7), _C_LABEL(Xrecurse_ioapic_level7)
+       .long _C_LABEL(Xresume_ioapic_level7)
+       .long _C_LABEL(Xintr_ioapic_level8), _C_LABEL(Xrecurse_ioapic_level8)
+       .long _C_LABEL(Xresume_ioapic_level8)
+       .long _C_LABEL(Xintr_ioapic_level9), _C_LABEL(Xrecurse_ioapic_level9)
+       .long _C_LABEL(Xresume_ioapic_level9)
+       .long _C_LABEL(Xintr_ioapic_level10), _C_LABEL(Xrecurse_ioapic_level10)
+       .long _C_LABEL(Xresume_ioapic_level10)
+       .long _C_LABEL(Xintr_ioapic_level11), _C_LABEL(Xrecurse_ioapic_level11)
+       .long _C_LABEL(Xresume_ioapic_level11)
+       .long _C_LABEL(Xintr_ioapic_level12), _C_LABEL(Xrecurse_ioapic_level12)
+       .long _C_LABEL(Xresume_ioapic_level12)
+       .long _C_LABEL(Xintr_ioapic_level13), _C_LABEL(Xrecurse_ioapic_level13)
+       .long _C_LABEL(Xresume_ioapic_level13)
+       .long _C_LABEL(Xintr_ioapic_level14), _C_LABEL(Xrecurse_ioapic_level14)
+       .long _C_LABEL(Xresume_ioapic_level14)
+       .long _C_LABEL(Xintr_ioapic_level15), _C_LABEL(Xrecurse_ioapic_level15)
+       .long _C_LABEL(Xresume_ioapic_level15)
+       .long _C_LABEL(Xintr_ioapic_level16), _C_LABEL(Xrecurse_ioapic_level16)
+       .long _C_LABEL(Xresume_ioapic_level16)
+       .long _C_LABEL(Xintr_ioapic_level17), _C_LABEL(Xrecurse_ioapic_level17)
+       .long _C_LABEL(Xresume_ioapic_level17)
+       .long _C_LABEL(Xintr_ioapic_level18), _C_LABEL(Xrecurse_ioapic_level18)
+       .long _C_LABEL(Xresume_ioapic_level18)
+       .long _C_LABEL(Xintr_ioapic_level19), _C_LABEL(Xrecurse_ioapic_level19)
+       .long _C_LABEL(Xresume_ioapic_level19)
+       .long _C_LABEL(Xintr_ioapic_level20), _C_LABEL(Xrecurse_ioapic_level20)
+       .long _C_LABEL(Xresume_ioapic_level20)
+       .long _C_LABEL(Xintr_ioapic_level21), _C_LABEL(Xrecurse_ioapic_level21)
+       .long _C_LABEL(Xresume_ioapic_level21)
+       .long _C_LABEL(Xintr_ioapic_level22), _C_LABEL(Xrecurse_ioapic_level22)
+       .long _C_LABEL(Xresume_ioapic_level22)
+       .long _C_LABEL(Xintr_ioapic_level23), _C_LABEL(Xrecurse_ioapic_level23)
+       .long _C_LABEL(Xresume_ioapic_level23)
+       .long _C_LABEL(Xintr_ioapic_level24), _C_LABEL(Xrecurse_ioapic_level24)
+       .long _C_LABEL(Xresume_ioapic_level24)
+       .long _C_LABEL(Xintr_ioapic_level25), _C_LABEL(Xrecurse_ioapic_level25)
+       .long _C_LABEL(Xresume_ioapic_level25)
+       .long _C_LABEL(Xintr_ioapic_level26), _C_LABEL(Xrecurse_ioapic_level26)
+       .long _C_LABEL(Xresume_ioapic_level26)
+       .long _C_LABEL(Xintr_ioapic_level27), _C_LABEL(Xrecurse_ioapic_level27)
+       .long _C_LABEL(Xresume_ioapic_level27)
+       .long _C_LABEL(Xintr_ioapic_level28), _C_LABEL(Xrecurse_ioapic_level28)
+       .long _C_LABEL(Xresume_ioapic_level28)
+       .long _C_LABEL(Xintr_ioapic_level29), _C_LABEL(Xrecurse_ioapic_level29)
+       .long _C_LABEL(Xresume_ioapic_level29)
+       .long _C_LABEL(Xintr_ioapic_level30), _C_LABEL(Xrecurse_ioapic_level30)
+       .long _C_LABEL(Xresume_ioapic_level30)
+       .long _C_LABEL(Xintr_ioapic_level31), _C_LABEL(Xrecurse_ioapic_level31)
+       .long _C_LABEL(Xresume_ioapic_level31)
+#endif
+
+/*
+ * Symbols that vmstat -i wants, even though they're not used.
+ */
+.globl _C_LABEL(intrnames)
+_C_LABEL(intrnames):
+.globl _C_LABEL(eintrnames)
+_C_LABEL(eintrnames):
+
+.globl _C_LABEL(intrcnt)
+_C_LABEL(intrcnt):
+.globl _C_LABEL(eintrcnt)
+_C_LABEL(eintrcnt):
+
+/*
+ * Soft interrupt handlers
+ */
+
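+/*
+ * These stubs are entered from Xspllower/Xdoreti (as in stock
+ * NetBSD's spl code) with the return address in %esi, hence the
+ * final `jmp *%esi' instead of a ret.
+ */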
+IDTVEC(softserial)
+       movl    $IPL_SOFTSERIAL, CPUVAR(ILEVEL)
+       incl    CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintlock)
+#endif
+       movl    CPUVAR(ISOURCES) + SIR_SERIAL * 4, %edi
+       addl    $1,IS_EVCNTLO(%edi)
+       adcl    $0,IS_EVCNTHI(%edi)
+       pushl   $X86_SOFTINTR_SOFTSERIAL
+       call    _C_LABEL(softintr_dispatch)
+       addl    $4,%esp
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintunlock)
+#endif
+       decl    CPUVAR(IDEPTH)
+       jmp     *%esi
+
+IDTVEC(softnet)
+       movl    $IPL_SOFTNET, CPUVAR(ILEVEL)
+       incl    CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintlock)
+#endif
+       movl    CPUVAR(ISOURCES) + SIR_NET * 4, %edi
+       addl    $1,IS_EVCNTLO(%edi)
+       adcl    $0,IS_EVCNTHI(%edi)
+
+       xorl    %edi,%edi
+       xchgl   _C_LABEL(netisr),%edi
+
+       /* XXX Do the legacy netisrs here for now. */
+#define DONETISR(s, c) \
+       .globl  _C_LABEL(c)     ;\
+       testl   $(1 << s),%edi  ;\
+       jz      1f              ;\
+       call    _C_LABEL(c)     ;\
+1:
+#include <net/netisr_dispatch.h>
+
+       pushl   $X86_SOFTINTR_SOFTNET
+       call    _C_LABEL(softintr_dispatch)
+       addl    $4,%esp
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintunlock)
+#endif
+       decl    CPUVAR(IDEPTH)
+       jmp     *%esi
+
+IDTVEC(softclock)
+       movl    $IPL_SOFTCLOCK, CPUVAR(ILEVEL)
+       incl    CPUVAR(IDEPTH)
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintlock)
+#endif
+       movl    CPUVAR(ISOURCES) + SIR_CLOCK * 4, %edi
+       addl    $1,IS_EVCNTLO(%edi)
+       adcl    $0,IS_EVCNTHI(%edi)
+
+       pushl   $X86_SOFTINTR_SOFTCLOCK
+       call    _C_LABEL(softintr_dispatch)
+       addl    $4,%esp
+#ifdef MULTIPROCESSOR
+       call    _C_LABEL(x86_softintunlock)
+#endif
+       decl    CPUVAR(IDEPTH)
+       jmp     *%esi
+
+/*
+ * Trap and fault vector routines
+ *
+ * On exit from the kernel to user mode, we always need to check for ASTs.  In
+ * addition, we need to do this atomically; otherwise an interrupt may occur
+ * which causes an AST, but it won't get processed until the next kernel entry
+ * (possibly the next clock tick).  Thus, we disable interrupts before checking,
+ * and only enable them again on the final `iret' or before calling the AST
+ * handler.
+ */
+
+#define TRAP(a)                        pushl $(a) ; jmp _C_LABEL(alltraps)
+#define ZTRAP(a)               pushl $0 ; TRAP(a)
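+/*
+ * TRAP() is for vectors where the CPU has already pushed an error
+ * code; ZTRAP() pushes a dummy 0 first so both build identical
+ * trap frames.
+ */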
+
+#ifdef IPKDB
+#define BPTTRAP(a)     pushl $0; pushl $(a); jmp _C_LABEL(bpttraps)
+#else
+#define BPTTRAP(a)     ZTRAP(a)
+#endif
+
+
+       .text
+IDTVEC(trap00)
+       ZTRAP(T_DIVIDE)
+IDTVEC(trap01)
+       BPTTRAP(T_TRCTRAP)
+IDTVEC(trap02)
+       ZTRAP(T_NMI)
+IDTVEC(trap03)
+       BPTTRAP(T_BPTFLT)
+IDTVEC(trap04)
+       ZTRAP(T_OFLOW)
+IDTVEC(trap05)
+       ZTRAP(T_BOUND)
+IDTVEC(trap06)
+       ZTRAP(T_PRIVINFLT)
+IDTVEC(trap07)
+#if NNPX > 0
+       pushl   $0                      # dummy error code
+       pushl   $T_DNA
+       INTRENTRY
+#ifdef XENDEBUG_LOW
+       pushl   %esp
+#endif
+       pushl   CPUVAR(SELF)
+       call    *_C_LABEL(npxdna_func)
+       addl    $4,%esp
+#ifdef XENDEBUG_LOW
+       addl    $4,%esp
+#endif
+       testl   %eax,%eax
+       jz      calltrap
+       INTRFASTEXIT
+#else
+       ZTRAP(T_DNA)
+#endif
+IDTVEC(trap08)
+       TRAP(T_DOUBLEFLT)
+IDTVEC(trap09)
+       ZTRAP(T_FPOPFLT)
+IDTVEC(trap0a)
+       TRAP(T_TSSFLT)
+IDTVEC(trap0b)
+       TRAP(T_SEGNPFLT)
+IDTVEC(trap0c)
+       TRAP(T_STKFLT)
+IDTVEC(trap0d)
+       TRAP(T_PROTFLT)
+#ifndef XEN
+IDTVEC(trap0e)
+#ifndef I586_CPU
+       TRAP(T_PAGEFLT)
+#else
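+       /*
+        * Pentium "f00f" erratum workaround (as in stock NetBSD):
+        * with the IDT mapped read-only, the erratum shows up as a
+        * kernel-mode page fault on pentium_idt entry 6, which is
+        * converted back into a T_PRIVINFLT below.
+        */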
+       pushl   $T_PAGEFLT
+       INTRENTRY
+       testb   $PGEX_U,TF_ERR(%esp)
+       jnz     calltrap
+       movl    %cr2,%eax
+       subl    _C_LABEL(pentium_idt),%eax
+       cmpl    $(6*8),%eax
+       jne     calltrap
+       movb    $T_PRIVINFLT,TF_TRAPNO(%esp)
+       jmp     calltrap
+#endif
+#endif
+
+IDTVEC(intrspurious)
+IDTVEC(trap0f)
+       /*
+        * The Pentium Pro local APIC may erroneously call this vector for a
+        * default IR7.  Just ignore it.
+        *
+        * (The local APIC does this when CPL is raised while it's on the
+        * way to delivering an interrupt.. presumably enough has been set
+        * up that it's inconvenient to abort delivery completely..)
+        */
+       iret
+
+IDTVEC(trap10)
+#if NNPX > 0
+       /*
+        * Handle like an interrupt so that we can call npxintr to clear the
+        * error.  It would be better to handle npx interrupts as traps but
+        * this is difficult for nested interrupts.
+        */
+       pushl   $0                      # dummy error code
+       pushl   $T_ASTFLT
+       INTRENTRY
+       pushl   CPUVAR(ILEVEL)
+       pushl   %esp
+       incl    _C_LABEL(uvmexp)+V_TRAP
+       call    _C_LABEL(npxintr)
+       addl    $8,%esp
+       INTRFASTEXIT
+#else
+       ZTRAP(T_ARITHTRAP)
+#endif
+IDTVEC(trap11)
+       TRAP(T_ALIGNFLT)
+IDTVEC(trap12)
+IDTVEC(trap13)
+IDTVEC(trap14)
+IDTVEC(trap15)
+IDTVEC(trap16)
+IDTVEC(trap17)
+IDTVEC(trap18)
+IDTVEC(trap19)
+IDTVEC(trap1a)
+IDTVEC(trap1b)
+IDTVEC(trap1c)
+IDTVEC(trap1d)
+IDTVEC(trap1e)
+IDTVEC(trap1f)
+       /* 18 - 31 reserved for future expansion */
+       ZTRAP(T_RESERVED)
+
+IDTVEC(exceptions)
+#ifndef XENDEBUG_LOW
+       .long   _C_LABEL(Xtrap00), _C_LABEL(Xtrap01)
+       .long   _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)
+       .long   _C_LABEL(Xtrap04), _C_LABEL(Xtrap05)
+       .long   _C_LABEL(Xtrap06), _C_LABEL(Xtrap07)
+       .long   _C_LABEL(Xtrap08), _C_LABEL(Xtrap09)
+       .long   _C_LABEL(Xtrap0a), _C_LABEL(Xtrap0b)
+       .long   _C_LABEL(Xtrap0c), _C_LABEL(Xtrap0d)
+       .long   _C_LABEL(Xtrap0e), _C_LABEL(Xtrap0f)
+       .long   _C_LABEL(Xtrap10), _C_LABEL(Xtrap11)
+       .long   _C_LABEL(Xtrap12), _C_LABEL(Xtrap13)
+       .long   _C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
+       .long   _C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
+       .long   _C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
+       .long   _C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
+       .long   _C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
+       .long   _C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
+#else
+       .long   _C_LABEL(divide_error), _C_LABEL(debug)
+       .long   _C_LABEL(Xtrap02), _C_LABEL(Xtrap03)        #int3)
+       .long   _C_LABEL(overflow), _C_LABEL(bounds)
+       .long   _C_LABEL(invalid_op), _C_LABEL(device_not_available)
+       .long   _C_LABEL(double_fault), _C_LABEL(coprocessor_segment_overrun)
+       .long   _C_LABEL(invalid_TSS), _C_LABEL(segment_not_present)
+       .long   _C_LABEL(stack_segment)
+       #.long  _C_LABEL(general_protection)
+        .long  _C_LABEL(Xtrap0d)
+       #.long  _C_LABEL(page_fault)
+        .long  _C_LABEL(Xtrap0e)
+       .long   _C_LABEL(spurious_interrupt_bug)
+       .long   _C_LABEL(coprocessor_error), _C_LABEL(alignment_check)
+       .long   _C_LABEL(machine_check), _C_LABEL(simd_coprocessor_error)
+       .long   _C_LABEL(Xtrap14), _C_LABEL(Xtrap15)
+       .long   _C_LABEL(Xtrap16), _C_LABEL(Xtrap17)
+       .long   _C_LABEL(Xtrap18), _C_LABEL(Xtrap19)
+       .long   _C_LABEL(Xtrap1a), _C_LABEL(Xtrap1b)
+       .long   _C_LABEL(Xtrap1c), _C_LABEL(Xtrap1d)
+       .long   _C_LABEL(Xtrap1e), _C_LABEL(Xtrap1f)
+#endif
+
+
+IDTVEC(tss_trap08)
+1:
+       str     %ax
+       GET_TSS
+       movzwl  (%eax),%eax
+       GET_TSS
+       pushl   $T_DOUBLEFLT
+       pushl   %eax
+       call    _C_LABEL(trap_tss)
+       addl    $12,%esp
+       iret
+       jmp     1b
+
+/* LINTSTUB: Ignore */
+NENTRY(alltraps)
+       INTRENTRY
+calltrap:
+#ifdef DIAGNOSTIC
+       movl    CPUVAR(ILEVEL),%ebx
+#endif /* DIAGNOSTIC */
+       pushl   %esp
+       call    _C_LABEL(trap)
+       addl    $4,%esp
+       testb   $CHK_UPL,TF_CS(%esp)
+       jnz     alltraps_checkast
+#ifdef VM86
+       testl   $PSL_VM,TF_EFLAGS(%esp)
+       jz      6f
+#else
+       jmp     6f
+#endif
+alltraps_checkast:
+       /* Check for ASTs on exit to user mode. */
+       CLI(%eax)
+       CHECK_ASTPENDING(%eax)
+       jz      3f
+5:     CLEAR_ASTPENDING(%eax)
+       STI(%eax)
+       movl    $T_ASTFLT,TF_TRAPNO(%esp)
+       pushl   %esp
+       call    _C_LABEL(trap)
+       addl    $4,%esp
+       jmp     alltraps_checkast       /* re-check ASTs */
+3:     CHECK_DEFERRED_SWITCH(%eax)
+       jnz     9f
+6:     STIC(%eax)
+       jz      4f
+       call    _C_LABEL(stipending)
+       #testl  %eax,%eax               /* XXXcl */
+       #jnz    1b
+4:
+#ifndef DIAGNOSTIC
+       INTRFASTEXIT
+#else
+       cmpl    CPUVAR(ILEVEL),%ebx
+       jne     3f
+       INTRFASTEXIT
+3:     pushl   $4f
+       call    _C_LABEL(printf)
+       addl    $4,%esp
+#ifdef DDB
+       int     $3
+#endif /* DDB */
+       movl    %ebx,CPUVAR(ILEVEL)
+       jmp     alltraps_checkast       /* re-check ASTs */
+4:     .asciz  "WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
+#endif /* DIAGNOSTIC */
+9:     STI(%eax)
+       call    _C_LABEL(pmap_load)
+       jmp     alltraps_checkast       /* re-check ASTs */
+
+/* LINTSTUB: Ignore */
+IDTVEC(trap0e)
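+       /*
+        * Under Xen we cannot read %cr2 directly; the hypervisor
+        * instead supplies the linear faulting address, which ends up
+        * in the trap-number slot of the frame (cf. the page_fault
+        * note further below).  Save it, substitute T_PAGEFLT and
+        * pass the address to trap() as an extra argument.
+        */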
+       INTRENTRY
+       movl    TF_TRAPNO(%esp),%eax
+       movl    $T_PAGEFLT,TF_TRAPNO(%esp)
+#ifdef DIAGNOSTIC
+       movl    CPUVAR(ILEVEL),%ebx
+#endif /* DIAGNOSTIC */
+       #pushl  %esp
+       pushl   %eax
+       movl    %esp,%eax
+       addl    $4,%eax
+       pushl   %eax
+       call    _C_LABEL(trap)
+       addl    $4,%esp
+       addl    $4,%esp
+       testb   $CHK_UPL,TF_CS(%esp)
+       jnz     trap0e_checkast
+#ifdef VM86
+       testl   $PSL_VM,TF_EFLAGS(%esp)
+       jz      6f
+#else
+       jmp     6f
+#endif
+trap0e_checkast:
+       /* Check for ASTs on exit to user mode. */
+       CLI(%eax)
+       CHECK_ASTPENDING(%eax)
+       jz      3f
+5:     CLEAR_ASTPENDING(%eax)
+       STI(%eax)
+       movl    $T_ASTFLT,TF_TRAPNO(%esp)
+       pushl   %esp
+       call    _C_LABEL(trap)
+       addl    $4,%esp
+       jmp     trap0e_checkast         /* re-check ASTs */
+3:     CHECK_DEFERRED_SWITCH(%eax)
+       jnz     9f
+6:     STIC(%eax)
+       jz      4f
+       call    _C_LABEL(stipending)
+       #testl  %eax,%eax               /* XXXcl */
+       #jnz    1b
+4:
+#ifndef DIAGNOSTIC
+       INTRFASTEXIT
+#else
+       cmpl    CPUVAR(ILEVEL),%ebx
+       jne     3f
+       INTRFASTEXIT
+3:     pushl   $4f
+       call    _C_LABEL(printf)
+       addl    $4,%esp
+#ifdef DDB
+       int     $3
+#endif /* DDB */
+       movl    %ebx,CPUVAR(ILEVEL)
+       jmp     trap0e_checkast         /* re-check ASTs */
+4:     .asciz  "WARNING: SPL NOT LOWERED ON TRAP EXIT\n"
+#endif /* DIAGNOSTIC */
+9:     STI(%eax)
+       call    _C_LABEL(pmap_load)
+       jmp     trap0e_checkast         /* re-check ASTs */
+
+#ifdef IPKDB
+/* LINTSTUB: Ignore */
+NENTRY(bpttraps)
+       INTRENTRY
+       call    _C_LABEL(ipkdb_trap_glue)
+       testl   %eax,%eax
+       jz      calltrap
+       INTRFASTEXIT
+
+ipkdbsetup:
+       popl    %ecx
+
+       /* Disable write protection: */
+       movl    %cr0,%eax
+       pushl   %eax
+       andl    $~CR0_WP,%eax
+       movl    %eax,%cr0
+
+       /* Substitute Protection & Page Fault handlers: */
+       movl    _C_LABEL(idt),%edx
+       pushl   13*8(%edx)
+       pushl   13*8+4(%edx)
+       pushl   14*8(%edx)
+       pushl   14*8+4(%edx)
+       movl    $fault,%eax
+       movw    %ax,13*8(%edx)
+       movw    %ax,14*8(%edx)
+       shrl    $16,%eax
+       movw    %ax,13*8+6(%edx)
+       movw    %ax,14*8+6(%edx)
+
+       pushl   %ecx
+       ret
+
+ipkdbrestore:
+       popl    %ecx
+
+       /* Restore Protection & Page Fault handlers: */
+       movl    _C_LABEL(idt),%edx
+       popl    14*8+4(%edx)
+       popl    14*8(%edx)
+       popl    13*8+4(%edx)
+       popl    13*8(%edx)
+
+       /* Restore write protection: */
+       popl    %edx
+       movl    %edx,%cr0
+
+       pushl   %ecx
+       ret
+#endif /* IPKDB */
+
+
+/*
+ * If an error is detected during trap, syscall, or interrupt exit, trap() will
+ * change %eip to point to one of these labels.  We clean up the stack, if
+ * necessary, and resume as if we were handling a general protection fault.
+ * This will cause the process to get a SIGBUS.
+ */
+/* LINTSTUB: Var: char resume_iret[1]; */
+NENTRY(resume_iret)
+       ZTRAP(T_PROTFLT)
+/* LINTSTUB: Var: char resume_pop_ds[1]; */
+NENTRY(resume_pop_ds)
+       movl    %es,TF_ES(%esp)
+       movl    $GSEL(GDATA_SEL, SEL_KPL),%eax
+       movw    %ax,%es
+/* LINTSTUB: Var: char resume_pop_es[1]; */
+NENTRY(resume_pop_es)
+       movl    %fs,TF_FS(%esp)
+       movl    $GSEL(GDATA_SEL, SEL_KPL),%eax
+       movw    %ax,%fs
+/* LINTSTUB: Var: char resume_pop_fs[1]; */
+NENTRY(resume_pop_fs)
+       movl    %gs,TF_GS(%esp)
+       movl    $GSEL(GDATA_SEL, SEL_KPL),%eax
+       movw    %ax,%gs
+/* LINTSTUB: Var: char resume_pop_gs[1]; */
+NENTRY(resume_pop_gs)
+       movl    $T_PROTFLT,TF_TRAPNO(%esp)
+       jmp     calltrap
+
+#ifdef IPKDB
+/* LINTSTUB: Func: int ipkdbfbyte(u_char *c) */
+NENTRY(ipkdbfbyte)
+       pushl   %ebp
+       movl    %esp,%ebp
+       call    ipkdbsetup
+       movl    8(%ebp),%edx
+       movzbl  (%edx),%eax
+faultexit:
+       call    ipkdbrestore
+       popl    %ebp
+       ret
+
+/* LINTSTUB: Func: int ipkdbsbyte(u_char *c, int i) */
+NENTRY(ipkdbsbyte)
+       pushl   %ebp
+       movl    %esp,%ebp
+       call    ipkdbsetup
+       movl    8(%ebp),%edx
+       movl    12(%ebp),%eax
+       movb    %al,(%edx)
+       call    ipkdbrestore
+       popl    %ebp
+       ret
+
+fault:
+       popl    %eax            /* error code */
+       movl    $faultexit,%eax
+       movl    %eax,(%esp)
+       movl    $-1,%eax
+       iret
+#endif /* IPKDB */
+
+
+
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(hypervisor_callback)
+       pushl   $0                      # dummy error code
+       pushl   $T_ASTFLT
+       INTRENTRY
+        movl TF_EIP(%esp),%eax
+        cmpl $scrit,%eax
+        jb   11f
+        cmpl $ecrit,%eax
+        jb   critical_region_fixup
+11:     push %esp
+        call do_hypervisor_callback
+        add  $4,%esp
+        movl HYPERVISOR_shared_info,%esi
+        xorl %eax,%eax
+        movb TF_CS(%esp),%cl
+        test $CHK_UPL,%cl              # slow return to ring 2 or 3
+        je   safesti
+        movl CPUVAR(ILEVEL),%ebx
+        jmp  doreti_checkast
+safesti:XEN_UNBLOCK_EVENTS(%esi)       # reenable event callbacks
+scrit:  /**** START OF CRITICAL REGION ****/
+        testb $1,evtchn_upcall_pending(%esi)
+        jnz  14f                       # process more events if necessary...
+        INTRFASTEXIT
+critiret:
+14:     XEN_BLOCK_EVENTS(%esi)
+        jmp  11b
+ecrit:  /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. Here no lookup table is needed: either the
+# interrupted %eip is at the final iret ('critiret'), in which case the
+# whole frame (TF_PUSHSIZE+0x8 bytes) has already been popped, or it is
+# earlier in the region and nothing has been popped yet.
+critical_region_fixup:
+        cmpl   $(critiret-1),%eax          # eip points to iret?
+       jne     1f
+       movl    $(TF_PUSHSIZE+0x8),%eax
+       jmp     2f
+1:     xorl    %eax,%eax
+2:
+                               # %eax contains num bytes popped
+        mov  %esp,%esi
+        add  %eax,%esi        # %esi points at end of src region
+        mov  %esp,%edi
+        add  $(TF_PUSHSIZE+0x8+0xC),%edi # %edi points at end of dst region
+        mov  %eax,%ecx
+        shr  $2,%ecx          # convert byte count to longwords
+        je   16f              # skip loop if nothing to copy
+15:     subl $4,%esi          # pre-decrementing copy loop
+        subl $4,%edi
+        movl (%esi),%eax
+        movl %eax,(%edi)
+        loop 15b
+16:     movl %edi,%esp        # final %edi is top of merged stack
+        jmp  11b
+
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+       pop     %ds
+       pop     %es
+       pop     %fs
+       pop     %gs
+       call    _C_LABEL(xen_failsafe_handler)
+       iret
+
+#ifdef XENDEBUG_LOW
+
+ES             = 0x20
+ORIG_EAX       = 0x24
+EIP            = 0x28
+CS             = 0x2C
+
+#define SAVE_ALL \
+       cld; \
+       pushl %es; \
+       pushl %ds; \
+       pushl %eax; \
+       pushl %ebp; \
+       pushl %edi; \
+       pushl %esi; \
+       pushl %edx; \
+       pushl %ecx; \
+       pushl %ebx; \
+       movl $GSEL(GDATA_SEL, SEL_KPL),%edx; \
+       movl %edx,%ds; \
+       movl %edx,%es;
+
+#define RESTORE_ALL    \
+       popl %ebx;      \
+       popl %ecx;      \
+       popl %edx;      \
+       popl %esi;      \
+       popl %edi;      \
+       popl %ebp;      \
+       popl %eax;      \
+       popl %ds;       \
+       popl %es;       \
+       addl $4,%esp;   \
+       iret;           \
+
+ret_from_exception:
+        movb CS(%esp),%cl
+       test $2,%cl          # slow return to ring 2 or 3
+       jne  safesti
+        RESTORE_ALL
+
+
+ENTRY(divide_error)
+       pushl $0                # no error code
+       pushl $do_divide_error
+do_exception:
+       pushl %ds
+       pushl %eax
+       xorl %eax,%eax
+       pushl %ebp
+       pushl %edi
+       pushl %esi
+       pushl %edx
+       decl %eax                       # eax = -1
+       pushl %ecx
+       pushl %ebx
+       cld
+       movl %es,%ecx
+       movl ORIG_EAX(%esp), %esi       # get the error code
+       movl ES(%esp), %edi             # get the function address
+       movl %eax, ORIG_EAX(%esp)
+       movl %ecx, ES(%esp)
+       movl %esp,%edx
+       pushl %esi                      # push the error code
+       pushl %edx                      # push the pt_regs pointer
+       movl $(__KERNEL_DS),%edx
+       movl %edx,%ds
+       movl %edx,%es
+       call *%edi
+       addl $8,%esp
+       jmp ret_from_exception
+
+ENTRY(coprocessor_error)
+       pushl $0
+       pushl $do_coprocessor_error
+       jmp do_exception
+
+ENTRY(simd_coprocessor_error)
+       pushl $0
+       pushl $do_simd_coprocessor_error
+       jmp do_exception
+
+ENTRY(device_not_available)
+        iret
+
+ENTRY(debug)
+       pushl $0
+       pushl $do_debug
+       jmp do_exception
+
+ENTRY(int3)
+       pushl $0
+       pushl $do_int3
+       jmp do_exception
+
+ENTRY(overflow)
+       pushl $0
+       pushl $do_overflow
+       jmp do_exception
+
+ENTRY(bounds)
+       pushl $0
+       pushl $do_bounds
+       jmp do_exception
+
+ENTRY(invalid_op)
+       pushl $0
+       pushl $do_invalid_op
+       jmp do_exception
+
+ENTRY(coprocessor_segment_overrun)
+       pushl $0
+       pushl $do_coprocessor_segment_overrun
+       jmp do_exception
+
+ENTRY(double_fault)
+       pushl $do_double_fault
+       jmp do_exception
+
+ENTRY(invalid_TSS)
+       pushl $do_invalid_TSS
+       jmp do_exception
+
+ENTRY(segment_not_present)
+       pushl $do_segment_not_present
+       jmp do_exception
+
+ENTRY(stack_segment)
+       pushl $do_stack_segment
+       jmp do_exception
+
+ENTRY(general_protection)
+       pushl $do_general_protection
+       jmp do_exception
+
+ENTRY(alignment_check)
+       pushl $do_alignment_check
+       jmp do_exception
+
+# This handler is special, because it gets an extra value on its stack,
+# which is the linear faulting address.
+ENTRY(page_fault)
+       pushl %ds
+       pushl %eax
+       xorl %eax,%eax
+       pushl %ebp
+       pushl %edi
+       pushl %esi
+       pushl %edx
+       decl %eax                       # eax = -1
+       pushl %ecx
+       pushl %ebx
+       cld
+       movl %es,%ecx
+       movl ORIG_EAX(%esp), %esi       # get the error code
+       movl ES(%esp), %edi             # get the faulting address
+       movl %eax, ORIG_EAX(%esp)
+       movl %ecx, ES(%esp)
+       movl %esp,%edx
+        pushl %edi                      # push the faulting address
+       pushl %esi                      # push the error code
+       pushl %edx                      # push the pt_regs pointer
+       movl $(__KERNEL_DS),%edx
+       movl %edx,%ds
+       movl %edx,%es
+       call do_page_fault
+       addl $12,%esp
+       jmp ret_from_exception
+
+ENTRY(machine_check)
+       pushl $0
+       pushl $do_machine_check
+       jmp do_exception
+
+ENTRY(spurious_interrupt_bug)
+       pushl $0
+       pushl $do_spurious_interrupt_bug
+       jmp do_exception
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c b/netbsd-2.0-xen-sparse/sys/arch/xen/i386/xen_machdep.c
new file mode 100644 (file)
index 0000000..d51baba
--- /dev/null
@@ -0,0 +1,680 @@
+/*     $NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $   */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_machdep.c,v 1.1.2.1 2004/05/22 15:57:33 he Exp $");
+
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+
+#include <uvm/uvm.h>
+
+#include <machine/gdt.h>
+#include <machine/xenfunc.h>
+#include <machine/xenpmap.h>
+
+/* #define     XENDEBUG */
+/* #define     XENDEBUG_LOW */
+
+#ifdef XENDEBUG
+#define        XENPRINTF(x) printf x
+#define        XENPRINTK(x) printk x
+#define        XENPRINTK2(x) /* printk x */
+
+static char XBUF[256];
+#else
+#define        XENPRINTF(x)
+#define        XENPRINTK(x)
+#define        XENPRINTK2(x)
+#endif
+void printk(char *, ...);
+#define        PRINTF(x) printf x
+#define        PRINTK(x) printk x
+
+shared_info_t *HYPERVISOR_shared_info;
+union start_info_union start_info_union;
+
+void xen_failsafe_handler(void);
+
+void
+xen_failsafe_handler(void)
+{
+
+       panic("xen_failsafe_handler called!\n");
+}
+
+
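+/*
+ * Descriptor tables are mapped read-only under Xen, so a GDT/LDT slot
+ * is updated via hypercall: translate the slot's kernel VA to a
+ * machine address (the PTE frame is already a machine frame) and let
+ * the hypervisor validate and install the new entry.
+ */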
+void
+xen_update_descriptor(union descriptor *table, union descriptor *entry)
+{
+       paddr_t pa;
+       pt_entry_t *ptp;
+
+       ptp = kvtopte((vaddr_t)table);
+       pa = (*ptp & PG_FRAME) | ((vaddr_t)table & ~PG_FRAME);
+       if (HYPERVISOR_update_descriptor(pa, entry->raw[0], entry->raw[1]))
+               panic("HYPERVISOR_update_descriptor failed\n");
+}
+
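+/*
+ * Install a new LDT.  Xen requires the backing pages to be mapped
+ * read-only before it will reference them, so strip PG_RW from each
+ * page's mapping and flush before queueing the set_ldt operation.
+ */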
+void
+xen_set_ldt(vaddr_t base, uint32_t entries)
+{
+       vaddr_t va;
+       pt_entry_t *ptp, *maptp;
+
+       for (va = base; va < base + entries * sizeof(union descriptor);
+            va += PAGE_SIZE) {
+               KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
+               ptp = kvtopte(va);
+               maptp = (pt_entry_t *)vtomach((vaddr_t)ptp);
+               XENPRINTF(("xen_set_ldt %p %d %p %p\n", (void *)base,
+                             entries, ptp, maptp));
+               PTE_CLEARBITS(ptp, maptp, PG_RW);
+       }
+       PTE_UPDATES_FLUSH();
+
+       xpq_queue_set_ldt(base, entries);
+       xpq_flush_queue();
+}
+
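+/*
+ * lgdt cannot be used under Xen; the GDT is registered with the
+ * hypervisor instead (presumably via HYPERVISOR_set_gdt in the gdt
+ * code), so any call here indicates a bug.
+ */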
+void
+lgdt(struct region_descriptor *rdp)
+{
+
+       panic("lgdt %p %08x\n", (void *)rdp->rd_base, rdp->rd_limit);
+}
+
+void
+xen_parse_cmdline(int what, union xen_cmdline_parseinfo *xcp)
+{
+       char *cmd_line, *opt, *s;
+       int b, i, ipidx = 0;
+       uint32_t xi_ip[5];
+
+       cmd_line = xen_start_info.cmd_line;
+
+       switch (what) {
+       case XEN_PARSE_BOOTDEV:
+               xcp->xcp_bootdev[0] = 0;
+               break;
+       case XEN_PARSE_CONSOLE:
+               xcp->xcp_console[0] = 0;
+               break;
+       }
+
+       while (cmd_line && *cmd_line) {
+               opt = cmd_line;
+               cmd_line = strchr(opt, ' ');
+               if (cmd_line)
+                       *cmd_line = 0;
+
+               switch (what) {
+               case XEN_PARSE_BOOTDEV:
+                       if (strncasecmp(opt, "bootdev=", 8) == 0)
+                               strncpy(xcp->xcp_bootdev, opt + 8,
+                                   sizeof(xcp->xcp_bootdev));
+                       break;
+
+               case XEN_PARSE_NETINFO:
+                       if (xcp->xcp_netinfo.xi_root &&
+                           strncasecmp(opt, "nfsroot=", 8) == 0)
+                               strncpy(xcp->xcp_netinfo.xi_root, opt + 8,
+                                   MNAMELEN);
+
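+                       /*
+                        * Linux nfsroot-style option (field meanings
+                        * follow from the cases below):
+                        * ip=<client>:<server>:<gateway>:<netmask>
+                        *     :<hostname>:<interface>
+                        */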
+                       if (strncasecmp(opt, "ip=", 3) == 0) {
+                               memset(xi_ip, 0, sizeof(xi_ip));
+                               opt += 3;
+                               ipidx = 0;
+                               while (opt && *opt) {
+                                       s = opt;
+                                       opt = strchr(opt, ':');
+                                       if (opt)
+                                               *opt = 0;
+
+                                       switch (ipidx) {
+                                       case 0: /* ip */
+                                       case 1: /* nfs server */
+                                       case 2: /* gw */
+                                       case 3: /* mask */
+                                       case 4: /* host */
+                                               if (*s == 0)
+                                                       break;
+                                               for (i = 0; i < 4; i++) {
+                                                       b = strtoul(s, &s, 10);
+                                                       xi_ip[ipidx] = b + 256
+                                                               * xi_ip[ipidx];
+                                                       if (*s != '.')
+                                                               break;
+                                                       s++;
+                                               }
+                                               if (i < 3)
+                                                       xi_ip[ipidx] = 0;
+                                               break;
+                                       case 5: /* interface */
+                                               if (!strncmp(s, "xennet", 6))
+                                                       s += 6;
+                                               else if (!strncmp(s, "eth", 3))
+                                                       s += 3;
+                                               else
+                                                       break;
+                                               if (xcp->xcp_netinfo.xi_ifno
+                                                   == strtoul(s, NULL, 10))
+                                                       memcpy(xcp->
+                                                           xcp_netinfo.xi_ip,
+                                                           xi_ip,
+                                                           sizeof(xi_ip));
+                                               break;
+                                       }
+                                       ipidx++;
+
+                                       if (opt)
+                                               *opt++ = ':';
+                               }
+                       }
+                       break;
+
+               case XEN_PARSE_CONSOLE:
+                       if (strncasecmp(opt, "console=", 8) == 0)
+                               strncpy(xcp->xcp_console, opt + 8,
+                                   sizeof(xcp->xcp_console));
+                       break;
+
+               }
+
+               if (cmd_line)
+                       *cmd_line++ = ' ';
+       }
+}
+
+
+
+
+
+#define XEN_PAGE_OFFSET 0xC0100000
+
+static pd_entry_t
+xpmap_get_bootpde(paddr_t va)
+{
+
+       return ((pd_entry_t *)xen_start_info.pt_base)[va >> PDSHIFT];
+}
+
+static pd_entry_t
+xpmap_get_vbootpde(paddr_t va)
+{
+       pd_entry_t pde;
+
+       pde = xpmap_get_bootpde(va);
+       if ((pde & PG_V) == 0)
+               return (pde & ~PG_FRAME);
+       return (pde & ~PG_FRAME) |
+               (xpmap_mtop(pde & PG_FRAME) + KERNBASE);
+}
+
+static pt_entry_t *
+xpmap_get_bootptep(paddr_t va)
+{
+       pd_entry_t pde;
+
+       pde = xpmap_get_vbootpde(va);
+       if ((pde & PG_V) == 0)
+               return (void *)-1;
+       return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]);
+}
+
+static pt_entry_t
+xpmap_get_bootpte(paddr_t va)
+{
+
+       return xpmap_get_bootptep(va)[0];
+}
+
+#if defined(XENDEBUG)
+static void
+xpmap_dump_pt(pt_entry_t *ptp, int p)
+{
+       pt_entry_t pte;
+       int j;
+       int bufpos;
+
+       pte = xpmap_ptom((uint32_t)ptp - KERNBASE);
+       PRINTK(("%03x: %p(%p) %08x\n", p, ptp, (void *)pte, p << PDSHIFT));
+
+       bufpos = 0;
+       for (j = 0; j < PTES_PER_PTP; j++) {
+               if ((ptp[j] & PG_V) == 0)
+                       continue;
+               pte = ptp[j] /* & PG_FRAME */;
+               bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ",
+                   p, j, pte);
+               if (bufpos > 70) {
+                       int k;
+                       sprintf(XBUF + bufpos, "\n");
+                       PRINTK((XBUF));
+                       bufpos = 0;
+                       for (k = 0; k < 1000000; k++);
+               }
+       }
+       if (bufpos) {
+               PRINTK((XBUF));
+               PRINTK(("\n"));
+               bufpos = 0;
+       }
+}
+#endif
+
+void
+xpmap_init(void)
+{
+       pd_entry_t *xen_pdp;
+       pt_entry_t *ptp, *sysptp;
+       pt_entry_t pte;
+       uint32_t i, j;
+       int bufpos;
+#if defined(XENDEBUG_LOW)
+       extern char kernel_text, _etext, __bss_start, end, *esym;
+#endif
+
+       xpmap_phys_to_machine_mapping = (void *)xen_start_info.mfn_list;
+
+       xen_pdp = (pd_entry_t *)xen_start_info.pt_base;
+
+       XENPRINTK(("text %p data %p bss %p end %p esym %p\n", &kernel_text,
+                  &_etext, &__bss_start, &end, esym));
+       XENPRINTK(("xpmap_init PTD %p nkpde %d upages %d xen_PTD %p p2m-map %p\n",
+                  (void *)PTDpaddr, nkpde, UPAGES, xen_pdp,
+                  xpmap_phys_to_machine_mapping));
+
+       bufpos = 0;
+
+       XENPRINTK(("shared_inf %08x\n", (paddr_t)xen_start_info.shared_info));
+       XENPRINTK(("c0100000: %08x\n",
+           xpmap_get_bootpte(0xc0100000)));
+
+       /* Map kernel. */
+
+       /* Map kernel data/bss/tables. */
+
+       /* Map ISA I/O memory. */
+       
+       /* Map kernel PDEs. */
+
+       /* Install a PDE recursively mapping page directory as a page table! */
+
+       sysptp = (pt_entry_t *)(PTDpaddr + ((1 + UPAGES) << PAGE_SHIFT));
+
+       /* make xen's PDE and PTE pages read-only in our pagetable */
+       for (i = 0; i < xen_start_info.nr_pt_frames; i++) {
+               /* mark PTE page read-only in our table */
+               sysptp[((xen_start_info.pt_base +
+                           (i << PAGE_SHIFT) - KERNBASE_LOCORE) & 
+                          (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW;
+       }
+
+       xpq_flush_queue();
+
+       for (i = 0; i < 1 + UPAGES + nkpde; i++) {
+               /* mark PTE page read-only in xen's table */
+               ptp = xpmap_get_bootptep(PTDpaddr + (i << PAGE_SHIFT));
+               xpq_queue_pte_update(
+                   (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp & ~PG_RW);
+               XENPRINTK(("%03x: %p(%p) -> %08x\n", i, ptp,
+                             (unsigned long)ptp - KERNTEXTOFF, *ptp));
+
+               /* mark PTE page read-only in our table */
+               sysptp[((PTDpaddr + (i << PAGE_SHIFT) - KERNBASE_LOCORE) & 
+                          (PD_MASK | PT_MASK)) >> PAGE_SHIFT] &= ~PG_RW;
+
+               /* update our pte's */
+               ptp = (pt_entry_t *)(PTDpaddr + (i << PAGE_SHIFT));
+#if 0
+               pte = xpmap_ptom((uint32_t)ptp - KERNBASE);
+               XENPRINTK(("%03x: %p(%p) %08x\n", i, ptp, pte, i << PDSHIFT));
+#endif
+               for (j = 0; j < PTES_PER_PTP; j++) {
+                       if ((ptp[j] & PG_V) == 0)
+                               continue;
+                       if (ptp[j] == 0xffffffff)
+                               ptp[j] = xen_start_info.shared_info |
+                                       (PG_V|PG_RW);
+                       if (ptp[j] >= KERNTEXTOFF) {
+                               pte = ptp[j];
+                               ptp[j] = (pte & ~PG_FRAME) |
+                                       (xpmap_get_bootpte(pte & PG_FRAME) &
+                                           PG_FRAME);
+                       }
+#if defined(XENDEBUG) && 0
+                       pte = ptp[j] /* & PG_FRAME */;
+                       bufpos += sprintf(XBUF + bufpos, "%x:%03x:%08x ",
+                           i, j, pte);
+                       if (bufpos > 70) {
+                               int k;
+                               sprintf(XBUF + bufpos, "\n");
+                               XENPRINTK((XBUF));
+                               bufpos = 0;
+                               for (k = 0; k < 1000000; k++);
+                       }
+               }
+               if (bufpos) {
+                       XENPRINTK((XBUF));
+                       bufpos = 0;
+#endif
+               }
+               if (i == 0)
+                       i = 1 + UPAGES - 1;
+       }
+
+#if 0
+       for (i = 0x300; i < 0x305; i++)
+               if (((pt_entry_t *)xen_start_info.pt_base)[i] & PG_V)
+                       xpmap_dump_pt((pt_entry_t *)
+                           (xpmap_mtop(((pt_entry_t *)xen_start_info.pt_base)[i] &
+                               PG_FRAME) + KERNBASE), i);
+       xpmap_dump_pt((pt_entry_t *)xen_start_info.pt_base, 0);
+#endif
+
+       XENPRINTK(("switching pdp: %p, %08lx, %p, %p, %p\n", (void *)PTDpaddr,
+                     PTDpaddr - KERNBASE,
+                     (void *)xpmap_ptom(PTDpaddr - KERNBASE),
+                     (void *)xpmap_get_bootpte(PTDpaddr),
+                     (void *)xpmap_mtop(xpmap_ptom(PTDpaddr - KERNBASE))));
+
+#if defined(XENDEBUG)
+       xpmap_dump_pt((pt_entry_t *)PTDpaddr, 0);
+#endif
+
+       xpq_flush_queue();
+
+       xpq_queue_pin_table(xpmap_get_bootpte(PTDpaddr) & PG_FRAME,
+           XPQ_PIN_L2_TABLE);
+       xpq_queue_pt_switch(xpmap_get_bootpte(PTDpaddr) & PG_FRAME);
+       xpq_queue_unpin_table(
+               xpmap_get_bootpte(xen_start_info.pt_base) & PG_FRAME);
+
+       /* make xen's PDE and PTE pages writable in our pagetable */
+       for (i = 0; i < xen_start_info.nr_pt_frames; i++) {
+               /* mark PTE page writable in our table */
+               ptp = &sysptp[((xen_start_info.pt_base +
+                                  (i << PAGE_SHIFT) - KERNBASE_LOCORE) & 
+                                 (PD_MASK | PT_MASK)) >> PAGE_SHIFT];
+               xpq_queue_pte_update(
+                   (void *)xpmap_ptom((unsigned long)ptp - KERNBASE), *ptp |
+                   PG_RW);
+       }
+
+       xpq_flush_queue();
+       XENPRINTK(("pt_switch done!\n"));
+}
+
+/*
+ * Do a binary search to find out where physical memory ends on the
+ * real hardware.  Xen will fail our updates if they are beyond the
+ * last available page (max_page in xen/common/memory.c).
+ */
+paddr_t
+find_pmap_mem_end(vaddr_t va)
+{
+       mmu_update_t r;
+       int start, end, ok;
+       pt_entry_t old;
+
+       start = xen_start_info.nr_pages;
+       end = HYPERVISOR_VIRT_START >> PAGE_SHIFT;
+
+       r.ptr = (unsigned long)&PTE_BASE[x86_btop(va)];
+       old = PTE_BASE[x86_btop(va)];
+
+       while (start + 1 < end) {
+               r.val = (((start + end) / 2) << PAGE_SHIFT) | PG_V;
+
+               if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0)
+                       end = (start + end) / 2;
+               else
+                       start = (start + end) / 2;
+       }
+       r.val = old;
+       if (HYPERVISOR_mmu_update(&r, 1, &ok) < 0)
+               printf("pmap_mem_end find: old update failed %08x\n",
+                   old);
+
+       return end << PAGE_SHIFT;
+}
+
+
+#if 0
+void xpmap_find_memory(paddr_t);
+void
+xpmap_find_memory(paddr_t first_avail)
+{
+       char buf[256];
+       uint32_t i;
+       int bufpos;
+       paddr_t p;
+
+       bufpos = 0;
+       for (i = ((first_avail - KERNTEXTOFF) >> PAGE_SHIFT);
+            i < xen_start_info.nr_pages; i++) {
+               /* if (xpmap_phys_to_machine_mapping[i] */
+               bufpos += sprintf(buf + bufpos, "%03x:%08x:%08x ",
+                   i, (uint32_t)xpmap_phys_to_machine_mapping[i],
+                   (uint32_t)xpmap_mtop(xpmap_phys_to_machine_mapping[i] <<
+                       PAGE_SHIFT));
+               p = xpmap_phys_to_machine_mapping[i];
+               uvm_page_physload(p, p + 1, p, p + 1, VM_FREELIST_DEFAULT);
+                   
+               if (bufpos > 70) {
+                       int k;
+                       sprintf(buf + bufpos, "\n");
+                       XENPRINTK((buf));
+                       bufpos = 0;
+                       for (k = 0; k < 1000000; k++);
+               }
+       }
+       if (bufpos) {
+               XENPRINTK((buf));
+               bufpos = 0;
+       }
+}
+#endif
+
+
+#ifdef XENDEBUG
+void xpq_debug_dump(void);
+#endif
+
+#define XPQUEUE_SIZE 2048
+typedef union xpq_queue {
+       struct {
+               pd_entry_t *ptr;
+               pd_entry_t val;
+       } pde;
+       struct {
+               pt_entry_t *ptr;
+               pt_entry_t val;
+       } pte;
+       struct {
+               paddr_t ptr;
+               uint32_t val;
+       } pa;
+} xpq_queue_t;
+static xpq_queue_t xpq_queue[XPQUEUE_SIZE];
+static int xpq_idx = 0;
+
+void
+xpq_flush_queue(void)
+{
+       int i, ok;
+
+       XENPRINTK2(("flush queue %p entries %d\n", xpq_queue, xpq_idx));
+       for (i = 0; i < xpq_idx; i++)
+               XENPRINTK2(("%d: %p %08x\n", i, xpq_queue[i].pde.ptr,
+                   xpq_queue[i].pde.val));
+       if (xpq_idx != 0 &&
+           HYPERVISOR_mmu_update((mmu_update_t *)xpq_queue, xpq_idx, &ok) < 0)
+               panic("HYPERVISOR_mmu_update failed");
+       xpq_idx = 0;
+}
+
+static inline void
+xpq_increment_idx(void)
+{
+
+       xpq_idx++;
+       if (__predict_false(xpq_idx == XPQUEUE_SIZE))
+               xpq_flush_queue();
+}
+
+void
+xpq_queue_invlpg(vaddr_t va)
+{
+
+       XENPRINTK2(("xpq_queue_invlpg %p\n", (void *)va));
+       xpq_queue[xpq_idx].pa.ptr = (va & PG_FRAME) | MMU_EXTENDED_COMMAND;
+       xpq_queue[xpq_idx].pa.val = MMUEXT_INVLPG;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_pde_update(pd_entry_t *ptr, pd_entry_t val)
+{
+
+       xpq_queue[xpq_idx].pde.ptr = ptr;
+       xpq_queue[xpq_idx].pde.val = val;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_pte_update(pt_entry_t *ptr, pt_entry_t val)
+{
+
+       xpq_queue[xpq_idx].pte.ptr = ptr;
+       xpq_queue[xpq_idx].pte.val = val;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_unchecked_pte_update(pt_entry_t *ptr, pt_entry_t val)
+{
+
+       xpq_queue[xpq_idx].pa.ptr = (paddr_t)ptr | MMU_NORMAL_PT_UPDATE;
+       /* XXXcl UNCHECKED_PT_UPDATE */
+       xpq_queue[xpq_idx].pa.val = val;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_pt_switch(paddr_t pa)
+{
+
+       XENPRINTK2(("xpq_queue_pt_switch: %p\n", (void *)pa));
+       xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+       xpq_queue[xpq_idx].pa.val = MMUEXT_NEW_BASEPTR;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_pin_table(paddr_t pa, int type)
+{
+
+       XENPRINTK2(("xpq_queue_pin_table: %p\n", (void *)pa));
+       xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+       switch (type) {
+       case XPQ_PIN_L1_TABLE:
+               xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L1_TABLE;
+               break;
+       case XPQ_PIN_L2_TABLE:
+               xpq_queue[xpq_idx].pa.val = MMUEXT_PIN_L2_TABLE;
+               break;
+       }
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_unpin_table(paddr_t pa)
+{
+
+       XENPRINTK2(("xpq_queue_unpin_table: %p\n", (void *)pa));
+       xpq_queue[xpq_idx].pa.ptr = pa | MMU_EXTENDED_COMMAND;
+       xpq_queue[xpq_idx].pa.val = MMUEXT_UNPIN_TABLE;
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
+{
+
+       XENPRINTK2(("xpq_queue_set_ldt\n"));
+       KASSERT(va == (va & PG_FRAME));
+       xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND | va;
+       xpq_queue[xpq_idx].pa.val = MMUEXT_SET_LDT |
+               (entries << MMUEXT_CMD_SHIFT);
+       xpq_increment_idx();
+}
+
+void
+xpq_queue_tlb_flush(void)
+{
+
+       XENPRINTK2(("xpq_queue_tlb_flush\n"));
+       xpq_queue[xpq_idx].pa.ptr = MMU_EXTENDED_COMMAND;
+       xpq_queue[xpq_idx].pa.val = MMUEXT_TLB_FLUSH;
+       xpq_increment_idx();
+}
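+
+/*
+ * Each xpq_queue_* function above just encodes one mmu_update_t
+ * request; the whole queue is handed to Xen verbatim by
+ * xpq_flush_queue().  A queued TLB flush, for example, is equivalent
+ * to this direct hypercall (an illustrative sketch only):
+ *
+ *     mmu_update_t req;
+ *     int done;
+ *
+ *     req.ptr = MMU_EXTENDED_COMMAND;
+ *     req.val = MMUEXT_TLB_FLUSH;
+ *     HYPERVISOR_mmu_update(&req, 1, &done);
+ */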
+
+#ifdef XENDEBUG
+void
+xpq_debug_dump(void)
+{
+       int i;
+
+       XENPRINTK2(("idx: %d\n", xpq_idx));
+       for (i = 0; i < xpq_idx; i++) {
+               sprintf(XBUF, "%p %08x ", xpq_queue[i].pte.ptr,
+                   xpq_queue[i].pte.val);
+               if (++i < xpq_idx)
+                       sprintf(XBUF + strlen(XBUF), "%p %08x ",
+                           xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+               if (++i < xpq_idx)
+                       sprintf(XBUF + strlen(XBUF), "%p %08x ",
+                           xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+               if (++i < xpq_idx)
+                       sprintf(XBUF + strlen(XBUF), "%p %08x ",
+                           xpq_queue[i].pte.ptr, xpq_queue[i].pte.val);
+               XENPRINTK2(("%d: %s\n", i, XBUF));
+       }
+}
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/frameasm.h
new file mode 100644 (file)
index 0000000..cad97f2
--- /dev/null
@@ -0,0 +1,130 @@
+/*     $NetBSD: frameasm.h,v 1.1 2004/03/11 21:44:08 cl Exp $  */
+/*     NetBSD: frameasm.h,v 1.4 2004/02/20 17:35:01 yamt Exp   */
+
+#ifndef _I386_FRAMEASM_H_
+#define _I386_FRAMEASM_H_
+
+#ifdef _KERNEL_OPT
+#include "opt_multiprocessor.h"
+#endif
+
+/* XXX assym.h */
+#define TRAP_INSTR     int $0x82
+#define __HYPERVISOR_stack_switch          4
+#define __HYPERVISOR_fpu_taskswitch       7
+
+#ifndef TRAPLOG
+#define TLOG           /**/
+#else
+/*
+ * Fill in trap record
+ */
+#define TLOG                                           \
+9:                                                     \
+       movl    %fs:CPU_TLOG_OFFSET, %eax;              \
+       movl    %fs:CPU_TLOG_BASE, %ebx;                \
+       addl    $SIZEOF_TREC,%eax;                      \
+       andl    $SIZEOF_TLOG-1,%eax;                    \
+       addl    %eax,%ebx;                              \
+       movl    %eax,%fs:CPU_TLOG_OFFSET;               \
+       movl    %esp,TREC_SP(%ebx);                     \
+       movl    $9b,TREC_HPC(%ebx);                     \
+       movl    TF_EIP(%esp),%eax;                      \
+       movl    %eax,TREC_IPC(%ebx);                    \
+       rdtsc                   ;                       \
+       movl    %eax,TREC_TSC(%ebx);                    \
+       movl    $MSR_LASTBRANCHFROMIP,%ecx;             \
+       rdmsr                   ;                       \
+       movl    %eax,TREC_LBF(%ebx);                    \
+       incl    %ecx            ;                       \
+       rdmsr                   ;                       \
+       movl    %eax,TREC_LBT(%ebx);                    \
+       incl    %ecx            ;                       \
+       rdmsr                   ;                       \
+       movl    %eax,TREC_IBF(%ebx);                    \
+       incl    %ecx            ;                       \
+       rdmsr                   ;                       \
+       movl    %eax,TREC_IBT(%ebx)
+#endif
+               
+/*
+ * These are used on interrupt or trap entry or exit.
+ */
+#define        INTRENTRY \
+       cld; \
+       subl    $TF_PUSHSIZE,%esp       ; \
+       movl    %gs,TF_GS(%esp) ; \
+       movl    %fs,TF_FS(%esp) ; \
+       movl    %eax,TF_EAX(%esp)       ; \
+       movl    %es,TF_ES(%esp) ; \
+       movl    %ds,TF_DS(%esp) ; \
+       movl    $GSEL(GDATA_SEL, SEL_KPL),%eax  ; \
+       movl    %edi,TF_EDI(%esp)       ; \
+       movl    %esi,TF_ESI(%esp)       ; \
+       movl    %eax,%ds        ; \
+       movl    %ebp,TF_EBP(%esp)       ; \
+       movl    %eax,%es        ; \
+       movl    %ebx,TF_EBX(%esp)       ; \
+       movl    %eax,%gs        ; \
+       movl    %edx,TF_EDX(%esp)       ; \
+       movl    $GSEL(GCPU_SEL, SEL_KPL),%eax   ; \
+       movl    %ecx,TF_ECX(%esp)       ; \
+       movl    %eax,%fs        ; \
+       TLOG
+
+#define        INTRFASTEXIT \
+       movl    TF_GS(%esp),%gs ; \
+       movl    TF_FS(%esp),%fs ; \
+       movl    TF_ES(%esp),%es ; \
+       movl    TF_DS(%esp),%ds ; \
+       movl    TF_EDI(%esp),%edi       ; \
+       movl    TF_ESI(%esp),%esi       ; \
+       movl    TF_EBP(%esp),%ebp       ; \
+       movl    TF_EBX(%esp),%ebx       ; \
+       movl    TF_EDX(%esp),%edx       ; \
+       movl    TF_ECX(%esp),%ecx       ; \
+       movl    TF_EAX(%esp),%eax       ; \
+       addl    $(TF_PUSHSIZE+8),%esp   ; \
+       iret
+
+#define        DO_DEFERRED_SWITCH(reg) \
+       cmpl    $0, CPUVAR(WANT_PMAPLOAD)               ; \
+       jz      1f                                      ; \
+       call    _C_LABEL(pmap_load)                     ; \
+       1:
+
+#define        CHECK_DEFERRED_SWITCH(reg) \
+       cmpl    $0, CPUVAR(WANT_PMAPLOAD)
+
+#define        CHECK_ASTPENDING(reg)   movl    CPUVAR(CURLWP),reg      ; \
+                               cmpl    $0, reg                 ; \
+                               je      1f                      ; \
+                               movl    L_PROC(reg),reg         ; \
+                               cmpl    $0, P_MD_ASTPENDING(reg); \
+                               1:
+#define        CLEAR_ASTPENDING(reg)   movl    $0, P_MD_ASTPENDING(reg)
+
+#if !defined(XEN)
+#define        CLI(reg)        cli
+#define        STI(reg)        sti
+#else
+/* XXX assym.h */
+#define        EVENTS_MASK 136
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending          /* 0: expands to empty, field is at offset 0 */
+#define evtchn_upcall_mask             1
+
+#define XEN_BLOCK_EVENTS(reg)  movb $1,evtchn_upcall_mask(reg)
+#define XEN_UNBLOCK_EVENTS(reg)        movb $0,evtchn_upcall_mask(reg)
+#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
+
+#define CLI(reg)       movl    _C_LABEL(HYPERVISOR_shared_info),reg ;  \
+                       XEN_BLOCK_EVENTS(reg)
+#define STI(reg)       movl    _C_LABEL(HYPERVISOR_shared_info),reg ;  \
+                       XEN_UNBLOCK_EVENTS(reg)
+#define STIC(reg)      movl    _C_LABEL(HYPERVISOR_shared_info),reg ;  \
+                       XEN_UNBLOCK_EVENTS(reg)  ; \
+                       testb $1,evtchn_upcall_pending(reg)
+#endif
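+
+/*
+ * Illustrative use of the Xen variants (a sketch, not code from this
+ * tree): an interrupt-return path can re-enable event delivery with
+ * STIC and branch if an upcall became pending while events were
+ * blocked:
+ *
+ *     STIC(%esi)
+ *     jnz     do_pending_events       # hypothetical label
+ */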
+
+#endif /* _I386_FRAMEASM_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h
new file mode 100644 (file)
index 0000000..13442d2
--- /dev/null
@@ -0,0 +1,423 @@
+/*     $NetBSD: hypervisor.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $  */
+
+/*
+ * 
+ * Communication to/from hypervisor.
+ * 
+ * Copyright (c) 2002-2003, K A Fraser
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _XEN_HYPERVISOR_H_
+#define _XEN_HYPERVISOR_H_
+
+
+struct hypervisor_attach_args {
+       const char              *haa_busname;
+};
+
+struct xencons_attach_args {
+       const char              *xa_device;
+};
+
+struct xen_npx_attach_args {
+       const char              *xa_device;
+};
+
+
+#define        u8 uint8_t
+#define        u16 uint16_t
+#define        u32 uint32_t
+#define        u64 uint64_t
+#define        s8 int8_t
+#define        s16 int16_t
+#define        s32 int32_t
+#define        s64 int64_t
+
+/* include the hypervisor interface */
+#include <sys/systm.h>
+#include <machine/hypervisor-ifs/hypervisor-if.h>
+#include <machine/hypervisor-ifs/dom0_ops.h>
+#include <machine/hypervisor-ifs/event_channel.h>
+#include <machine/hypervisor-ifs/io/domain_controller.h>
+#include <machine/hypervisor-ifs/io/netif.h>
+
+#undef u8
+#undef u16
+#undef u32
+#undef u64
+#undef s8
+#undef s16
+#undef s32
+#undef s64
+
+
+/*
+ * a placeholder for the start of day information passed up from the hypervisor
+ */
+union start_info_union
+{
+    start_info_t start_info;
+    char padding[512];
+};
+extern union start_info_union start_info_union;
+#define xen_start_info (start_info_union.start_info)
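+
+/*
+ * Example (sketch): early boot code reads its parameters straight out
+ * of this union, e.g. the first physical address past the pages
+ * granted to this domain:
+ *
+ *     paddr_t end_pa = (paddr_t)xen_start_info.nr_pages << PAGE_SHIFT;
+ */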
+
+
+/* hypervisor.c */
+void do_hypervisor_callback(struct trapframe *regs);
+void hypervisor_notify_via_evtchn(unsigned int);
+void hypervisor_enable_irq(unsigned int);
+void hypervisor_disable_irq(unsigned int);
+void hypervisor_acknowledge_irq(unsigned int);
+
+/* hypervisor_machdep.c */
+void hypervisor_unmask_event(unsigned int);
+void hypervisor_mask_event(unsigned int);
+void hypervisor_clear_event(unsigned int);
+void hypervisor_force_callback(void);
+
+/*
+ * Assembler stubs for hyper-calls.
+ */
+
+static inline int HYPERVISOR_set_trap_table(trap_info_t *table)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_trap_table),
+        "b" (table) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count,
+                                       int *success_count)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), 
+        "b" (req), "c" (count), "d" (success_count) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_gdt), 
+        "b" (frame_list), "c" (entries) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_stack_switch),
+        "b" (ss), "c" (esp) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_set_callbacks(
+    unsigned long event_selector, unsigned long event_address,
+    unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_callbacks),
+        "b" (event_selector), "c" (event_address), 
+        "d" (failsafe_selector), "S" (failsafe_address) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_fpu_taskswitch(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_fpu_taskswitch) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_yield(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_yield) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_block(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_block) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_shutdown(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_reboot(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_suspend(unsigned long srec)
+{
+    int ret;
+    /* NB. On suspend, control software expects a suspend record in %esi. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), 
+        "S" (srec) : "memory" );
+
+    return ret;
+}
+
+static inline long HYPERVISOR_set_timer_op(uint64_t timeout)
+{
+    long ret;
+    unsigned long timeout_hi = (unsigned long)(timeout>>32);
+    unsigned long timeout_lo = (unsigned long)timeout;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_timer_op),
+        "b" (timeout_hi), "c" (timeout_lo) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op)
+{
+    int ret;
+    dom0_op->interface_version = DOM0_INTERFACE_VERSION;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_dom0_op),
+        "b" (dom0_op) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_debugreg),
+        "b" (reg), "c" (value) : "memory" );
+
+    return ret;
+}
+
+static inline unsigned long HYPERVISOR_get_debugreg(int reg)
+{
+    unsigned long ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_get_debugreg),
+        "b" (reg) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_update_descriptor(
+    unsigned long pa, unsigned long word1, unsigned long word2)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_update_descriptor), 
+        "b" (pa), "c" (word1), "d" (word2) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_set_fast_trap(int idx)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_fast_trap), 
+        "b" (idx) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_dom_mem_op(unsigned int   op,
+                                        unsigned long *extent_list,
+                                        unsigned long  nr_extents,
+                                        unsigned int   extent_order)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_dom_mem_op),
+        "b" (op), "c" (extent_list), "d" (nr_extents), "S" (extent_order),
+       "D" (DOMID_SELF)
+        : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_multicall(void *call_list, int nr_calls)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_multicall),
+        "b" (call_list), "c" (nr_calls) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_update_va_mapping(
+    unsigned long page_nr, unsigned long new_val, unsigned long flags)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping), 
+        "b" (page_nr), "c" (new_val), "d" (flags) : "memory" );
+
+    if (__predict_false(ret < 0))
+        panic("Failed update VA mapping: %08lx, %08lx, %08lx",
+              page_nr, new_val, flags);
+
+    return ret;
+}
+
+static inline int HYPERVISOR_event_channel_op(void *op)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_event_channel_op),
+        "b" (op) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_xen_version(int cmd)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_xen_version), 
+        "b" (cmd) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_console_io),
+        "b" (cmd), "c" (count), "d" (str) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_physdev_op(void *physdev_op)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_physdev_op),
+        "b" (physdev_op) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_grant_table_op(void *gnttab_op)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_grant_table_op),
+        "b" (gnttab_op) : "memory" );
+
+    return ret;
+}
+
+static inline int HYPERVISOR_update_va_mapping_otherdomain(
+    unsigned long page_nr, unsigned long new_val, unsigned long flags, domid_t domid)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_update_va_mapping_otherdomain), 
+        "b" (page_nr), "c" (new_val), "d" (flags), "S" (domid) :
+        "memory" );
+    
+    return ret;
+}
+
+static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_vm_assist),
+        "b" (cmd), "c" (type) : "memory" );
+
+    return ret;
+}
+
+#endif /* _XEN_HYPERVISOR_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/if_xennetvar.h
new file mode 100644 (file)
index 0000000..32a774b
--- /dev/null
@@ -0,0 +1,110 @@
+/*     $NetBSD: if_xennetvar.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $  */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_IF_XENNETVAR_H_
+#define _XEN_IF_XENNETVAR_H_
+
+#include <machine/xen.h>
+
+union xennet_bufarray {
+       struct {
+               struct mbuf *xbtx_m;
+       } xb_tx;
+       struct {
+               vaddr_t xbrx_va;
+               paddr_t xbrx_pa;
+               struct xennet_softc *xbrx_sc;
+       } xb_rx;
+       int xb_next;
+};
+
+struct xennet_txbuf {
+       SLIST_ENTRY(xennet_txbuf)       xt_next;
+       struct xennet_softc             *xt_sc;
+       paddr_t                         xt_pa;
+       u_char                          xt_buf[0];
+};
+#define        TXBUF_PER_PAGE 2
+#define        TXBUF_BUFSIZE   ((PAGE_SIZE / TXBUF_PER_PAGE) - sizeof(struct xennet_txbuf))
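+/*
+ * Each transmit page is thus split into TXBUF_PER_PAGE regions, every
+ * region holding a struct xennet_txbuf header immediately followed by
+ * its xt_buf data area, roughly:
+ *
+ *     | xennet_txbuf | data ... | xennet_txbuf | data ... |
+ */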
+
+struct xennet_softc {
+       struct device           sc_dev;         /* base device glue */
+       struct ethercom         sc_ethercom;    /* Ethernet common part */
+
+       int                     sc_ifno;
+
+       uint8_t                 sc_enaddr[6];
+
+#ifdef mediacode
+       struct ifmedia          sc_media;
+#endif
+
+       /* What is the status of our connection to the remote backend? */
+#define BEST_CLOSED       0
+#define BEST_DISCONNECTED 1
+#define BEST_CONNECTED    2
+       unsigned int            sc_backend_state;
+
+       unsigned int            sc_evtchn;
+       unsigned int            sc_irq;
+
+       netif_tx_interface_t    *sc_tx;
+       netif_rx_interface_t    *sc_rx;
+
+       uint32_t                sc_tx_entries;
+       uint32_t                sc_tx_resp_cons;
+
+       uint32_t                sc_rx_resp_cons;
+       uint32_t                sc_rx_bufs_to_notify;
+
+       union xennet_bufarray   sc_tx_bufa[NETIF_TX_RING_SIZE];
+       union xennet_bufarray   sc_rx_bufa[NETIF_RX_RING_SIZE];
+
+       SLIST_HEAD(, xennet_txbuf)      sc_tx_bufs;
+};
+
+struct xennet_attach_args {
+       const char              *xa_device;
+       int                     xa_handle;
+};
+
+struct nfs_diskless;
+
+int xennet_scan(struct device *, struct xennet_attach_args *, cfprint_t);
+void xennet_start(struct ifnet *);
+int xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
+void xennet_watchdog(struct ifnet *ifp);
+int xennet_bootstatic_callback(struct nfs_diskless *);
+
+#endif /* _XEN_IF_XENNETVAR_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/pmap.h
new file mode 100644 (file)
index 0000000..1a482ea
--- /dev/null
@@ -0,0 +1,533 @@
+/*     $NetBSD: pmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $  */
+/*     NetBSD: pmap.h,v 1.79 2004/02/20 17:35:01 yamt Exp      */
+
+/*
+ *
+ * Copyright (c) 1997 Charles D. Cranor and Washington University.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgment:
+ *      This product includes software developed by Charles D. Cranor and
+ *      Washington University.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * pmap.h: see pmap.c for the history of this pmap module.
+ */
+
+#ifndef        _I386_PMAP_H_
+#define        _I386_PMAP_H_
+
+#if defined(_KERNEL_OPT)
+#include "opt_user_ldt.h"
+#include "opt_largepages.h"
+#endif
+
+#include "opt_xen.h"
+
+#include <machine/cpufunc.h>
+#include <machine/pte.h>
+#include <machine/xenfunc.h>
+#include <machine/xenpmap.h>
+#include <machine/segments.h>
+#include <uvm/uvm_object.h>
+
+/*
+ * see pte.h for a description of i386 MMU terminology and hardware
+ * interface.
+ *
+ * a pmap describes a process's 4GB virtual address space.  this
+ * virtual address space can be broken up into 1024 4MB regions which
+ * are described by PDEs in the PDP.  the PDEs are defined as follows:
+ *
+ * (ranges are inclusive -> exclusive, just like vm_map_entry start/end)
+ * (the following assumes that KERNBASE is 0xc0000000)
+ *
+ * PDE#s       VA range                usage
+ * 0->766      0x0 -> 0xbfc00000       user address space
+ * 767         0xbfc00000->            recursive mapping of PDP (used for
+ *                     0xc0000000      linear mapping of PTPs)
+ * 768->1023   0xc0000000->            kernel address space (constant
+ *                     0xffc00000      across all pmap's/processes)
+ * 1023                0xffc00000->            "alternate" recursive PDP mapping
+ *                     <end>           (for other pmaps)
+ *
+ *
+ * note: a recursive PDP mapping provides a way to map all the PTEs for
+ * a 4GB address space into a linear chunk of virtual memory.  in other
+ * words, the PTE for page 0 is the first int mapped into the 4MB recursive
+ * area.  the PTE for page 1 is the second int.  the very last int in the
+ * 4MB range is the PTE that maps VA 0xffffe000 (the last page in a 4GB
+ * address).
+ *
+ * all pmaps' PDs must have the same values in slots 768->1023 so that
+ * the kernel is always mapped in every process.  these values are loaded
+ * into the PD at pmap creation time.
+ *
+ * at any one time only one pmap can be active on a processor.  this is
+ * the pmap whose PDP is pointed to by processor register %cr3.  this pmap
+ * will have all its PTEs mapped into memory at the recursive mapping
+ * point (slot #767 as shown above).  when the pmap code wants to find the
+ * PTE for a virtual address, all it has to do is the following:
+ *
+ * address of PTE = (767 * 4MB) + (VA / PAGE_SIZE) * sizeof(pt_entry_t)
+ *                = 0xbfc00000 + (VA / 4096) * 4
+ *
+ * what happens if the pmap layer is asked to perform an operation
+ * on a pmap that is not the one which is currently active?  in that
+ * case we take the PA of the PDP of non-active pmap and put it in
+ * slot 1023 of the active pmap.  this causes the non-active pmap's
+ * PTEs to get mapped in the final 4MB of the 4GB address space
+ * (e.g. starting at 0xffc00000).
+ *
+ * the following figure shows the effects of the recursive PDP mapping:
+ *
+ *   PDP (%cr3)
+ *   +----+
+ *   |   0| -> PTP#0 that maps VA 0x0 -> 0x400000
+ *   |    |
+ *   |    |
+ *   | 767| -> points back to PDP (%cr3) mapping VA 0xbfc00000 -> 0xc0000000
+ *   | 768| -> first kernel PTP (maps 0xc0000000 -> 0xc0400000)
+ *   |    |
+ *   |1023| -> points to alternate pmap's PDP (maps 0xffc00000 -> end)
+ *   +----+
+ *
+ * note that the PDE#767 VA (0xbfc00000) is defined as "PTE_BASE"
+ * note that the PDE#1023 VA (0xffc00000) is defined as "APTE_BASE"
+ *
+ * starting at VA 0xbfc00000 the current active PDP (%cr3) acts as a
+ * PTP:
+ *
+ * PTP#767 == PDP(%cr3) => maps VA 0xbfc00000 -> 0xc0000000
+ *   +----+
+ *   |   0| -> maps the contents of PTP#0 at VA 0xbfc00000->0xbfc01000
+ *   |    |
+ *   |    |
+ *   | 767| -> maps contents of PTP#767 (the PDP) at VA 0xbffbf000
+ *   | 768| -> maps contents of first kernel PTP
+ *   |    |
+ *   |1023|
+ *   +----+
+ *
+ * note that mapping of the PDP at PTP#767's VA (0xbfeff000) is
+ * defined as "PDP_BASE".... within that mapping there are two
+ * defines:
+ *   "PDP_PDE" (0xbfeffbfc) is the VA of the PDE in the PDP
+ *      which points back to itself.
+ *   "APDP_PDE" (0xbfeffffc) is the VA of the PDE in the PDP which
+ *      establishes the recursive mapping of the alternate pmap.
+ *      to set the alternate PDP, one just has to put the correct
+ *     PA info in *APDP_PDE.
+ *
+ * note that in the APTE_BASE space, the APDP appears at VA
+ * "APDP_BASE" (0xfffff000).
+ */
+/* XXX MP should we allocate one APDP_PDE per processor?? */
+
+/*
+ * the following defines identify the slots used as described above.
+ */
+
+#define PDSLOT_PTE     ((KERNBASE/NBPD)-1) /* 767: for recursive PDP map */
+#define PDSLOT_KERN    (KERNBASE/NBPD)     /* 768: start of kernel space */
+#define PDSLOT_APTE    ((unsigned)1023-16) /* 1007: alternate recursive slot (top 16 slots are reserved by Xen) */
+
+/*
+ * the following defines give the virtual addresses of various MMU
+ * data structures:
+ * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
+ * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD
+ * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
+ */
+
+#define PTE_BASE       ((pt_entry_t *)  (PDSLOT_PTE * NBPD) )
+#define APTE_BASE      ((pt_entry_t *)  (PDSLOT_APTE * NBPD) )
+#define PDP_BASE ((pd_entry_t *)(((char *)PTE_BASE) + (PDSLOT_PTE * PAGE_SIZE)))
+#define APDP_BASE ((pd_entry_t *)(((char *)APTE_BASE) + (PDSLOT_APTE * PAGE_SIZE)))
+#define PDP_PDE                (PDP_BASE + PDSLOT_PTE)
+#define APDP_PDE       (PDP_BASE + PDSLOT_APTE)
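+
+/*
+ * worked example (a sketch, assuming KERNBASE == 0xc0000000 as above):
+ * the PTE mapping VA 0xc0100000 lives at
+ *
+ *     PTE_BASE + x86_btop(0xc0100000)
+ *         == 0xbfc00000 + (0xc0100000 / 4096) * 4
+ *         == 0xbff00400
+ */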
+
+/*
+ * the following define determines how many PTPs should be set up for the
+ * kernel by locore.S at boot time.  this should be large enough to
+ * get the VM system running.  once the VM system is running, the
+ * pmap module can add more PTPs to the kernel area on demand.
+ */
+
+#ifndef NKPTP
+#define NKPTP          4       /* 16MB to start */
+#endif
+#define NKPTP_MIN      4       /* smallest value we allow */
+#define NKPTP_MAX      (1024 - (KERNBASE/NBPD) - 1)
+                               /* largest value (-1 for APTP space) */
+
+/*
+ * pdei/ptei: generate index into PDP/PTP from a VA
+ */
+#define        pdei(VA)        (((VA) & PD_MASK) >> PDSHIFT)
+#define        ptei(VA)        (((VA) & PT_MASK) >> PGSHIFT)
+
+/*
+ * PTP macros:
+ *   a PTP's index is the PD index of the PDE that points to it
+ *   a PTP's offset is the byte-offset in the PTE space that this PTP is at
+ *   a PTP's VA is the first VA mapped by that PTP
+ *
+ * note that PAGE_SIZE == number of bytes in a PTP (4096 bytes == 1024 entries)
+ *           NBPD == number of bytes a PTP can map (4MB)
+ */
+
+#define ptp_i2o(I)     ((I) * PAGE_SIZE)       /* index => offset */
+#define ptp_o2i(O)     ((O) / PAGE_SIZE)       /* offset => index */
+#define ptp_i2v(I)     ((I) * NBPD)    /* index => VA */
+#define ptp_v2i(V)     ((V) / NBPD)    /* VA => index (same as pdei) */
+
+/*
+ * PG_AVAIL usage: we make use of the ignored bits of the PTE
+ */
+
+#define PG_W           PG_AVAIL1       /* "wired" mapping */
+#define PG_PVLIST      PG_AVAIL2       /* mapping has entry on pvlist */
+#define PG_X           PG_AVAIL3       /* executable mapping */
+
+/*
+ * Number of PTE's per cache line.  4 byte pte, 32-byte cache line
+ * Used to avoid false sharing of cache lines.
+ */
+#define NPTECL                 8
+
+#ifdef _KERNEL
+/*
+ * pmap data structures: see pmap.c for details of locking.
+ */
+
+struct pmap;
+typedef struct pmap *pmap_t;
+
+/*
+ * we maintain a list of all non-kernel pmaps
+ */
+
+LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
+
+/*
+ * the pmap structure
+ *
+ * note that the pm_obj contains the simple_lock, the reference count,
+ * page list, and number of PTPs within the pmap.
+ *
+ * XXX If we ever support processor numbers higher than 31, we'll have
+ * XXX to rethink the CPU mask.
+ */
+
+struct pmap {
+       struct uvm_object pm_obj;       /* object (lck by object lock) */
+#define        pm_lock pm_obj.vmobjlock
+       LIST_ENTRY(pmap) pm_list;       /* list (lck by pm_list lock) */
+       pd_entry_t *pm_pdir;            /* VA of PD (lck by object lock) */
+       u_int32_t pm_pdirpa;            /* PA of PD (read-only after create) */
+       struct vm_page *pm_ptphint;     /* pointer to a PTP in our pmap */
+       struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
+
+       vaddr_t pm_hiexec;              /* highest executable mapping */
+       int pm_flags;                   /* see below */
+
+       union descriptor *pm_ldt;       /* user-set LDT */
+       int pm_ldt_len;                 /* number of LDT entries */
+       int pm_ldt_sel;                 /* LDT selector */
+       u_int32_t pm_cpus;              /* mask of CPUs using pmap */
+};
+
+/* pm_flags */
+#define        PMF_USER_LDT    0x01    /* pmap has user-set LDT */
+
+/*
+ * for each managed physical page we maintain a list of <PMAP,VA>'s
+ * which it is mapped at.  the list is headed by a pv_head structure.
+ * there is one pv_head per managed phys page (allocated at boot time).
+ * the pv_head structure points to a list of pv_entry structures (each
+ * describes one mapping).
+ */
+
+struct pv_entry {                      /* locked by its list's pvh_lock */
+       SPLAY_ENTRY(pv_entry) pv_node;  /* splay-tree node */
+       struct pmap *pv_pmap;           /* the pmap */
+       vaddr_t pv_va;                  /* the virtual address */
+       struct vm_page *pv_ptp;         /* the vm_page of the PTP */
+};
+
+/*
+ * pv_entrys are dynamically allocated in chunks from a single page.
+ * we keep track of how many pv_entrys are in use for each page and
+ * we can free pv_entry pages if needed.  there is one lock for the
+ * entire allocation system.
+ */
+
+struct pv_page_info {
+       TAILQ_ENTRY(pv_page) pvpi_list;
+       struct pv_entry *pvpi_pvfree;
+       int pvpi_nfree;
+};
+
+/*
+ * number of pv_entry's in a pv_page
+ * (note: won't work on systems where NBPG isn't a constant)
+ */
+
+#define PVE_PER_PVPAGE ((PAGE_SIZE - sizeof(struct pv_page_info)) / \
+                       sizeof(struct pv_entry))
+
+/*
+ * a pv_page: where pv_entrys are allocated from
+ */
+
+struct pv_page {
+       struct pv_page_info pvinfo;
+       struct pv_entry pvents[PVE_PER_PVPAGE];
+};
+
+/*
+ * global kernel variables
+ */
+
+/* PTDpaddr: is the physical address of the kernel's PDP */
+extern u_long PTDpaddr;
+
+extern struct pmap kernel_pmap_store;  /* kernel pmap */
+extern int nkpde;                      /* current # of PDEs for kernel */
+extern int pmap_pg_g;                  /* do we support PG_G? */
+
+/*
+ * macros
+ */
+
+#define        pmap_kernel()                   (&kernel_pmap_store)
+#define        pmap_resident_count(pmap)       ((pmap)->pm_stats.resident_count)
+#define        pmap_wired_count(pmap)          ((pmap)->pm_stats.wired_count)
+#define        pmap_update(pmap)               /* nothing (yet) */
+
+#define pmap_clear_modify(pg)          pmap_clear_attrs(pg, PG_M)
+#define pmap_clear_reference(pg)       pmap_clear_attrs(pg, PG_U)
+#define pmap_copy(DP,SP,D,L,S)
+#define pmap_is_modified(pg)           pmap_test_attrs(pg, PG_M)
+#define pmap_is_referenced(pg)         pmap_test_attrs(pg, PG_U)
+#define pmap_move(DP,SP,D,L,S)
+#define pmap_phys_address(ppn)         x86_ptob(ppn)
+#define pmap_valid_entry(E)            ((E) & PG_V) /* is PDE or PTE valid? */
+
+
+/*
+ * prototypes
+ */
+
+void           pmap_activate(struct lwp *);
+void           pmap_bootstrap(vaddr_t);
+boolean_t      pmap_clear_attrs(struct vm_page *, int);
+void           pmap_deactivate(struct lwp *);
+void           pmap_deactivate2(struct lwp *);
+void           pmap_page_remove (struct vm_page *);
+void           pmap_remove(struct pmap *, vaddr_t, vaddr_t);
+boolean_t      pmap_test_attrs(struct vm_page *, int);
+void           pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
+int            pmap_exec_fixup(struct vm_map *, struct trapframe *,
+                   struct pcb *);
+void           pmap_load(void);
+int            pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, vm_prot_t,
+                   int);
+
+vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
+
+void   pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *);
+void   pmap_tlb_shootnow(int32_t);
+void   pmap_do_tlb_shootdown(struct cpu_info *);
+
+#define PMAP_GROWKERNEL                /* turn on pmap_growkernel interface */
+
+/*
+ * Do idle page zero'ing uncached to avoid polluting the cache.
+ */
+boolean_t                      pmap_pageidlezero(paddr_t);
+#define        PMAP_PAGEIDLEZERO(pa)   pmap_pageidlezero((pa))
+
+/*
+ * inline functions
+ */
+
+/*ARGSUSED*/
+static __inline void
+pmap_remove_all(struct pmap *pmap)
+{
+       /* Nothing. */
+}
+
+/*
+ * pmap_update_pg: flush one page from the TLB (or flush the whole thing
+ *     if hardware doesn't support one-page flushing)
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_update_pg(vaddr_t va)
+{
+#if defined(I386_CPU)
+       if (cpu_class == CPUCLASS_386)
+               tlbflush();
+       else
+#endif
+               invlpg((u_int) va);
+}
+
+/*
+ * pmap_update_2pg: flush two pages from the TLB
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_update_2pg(vaddr_t va, vaddr_t vb)
+{
+#if defined(I386_CPU)
+       if (cpu_class == CPUCLASS_386)
+               tlbflush();
+       else
+#endif
+       {
+               invlpg((u_int) va);
+               invlpg((u_int) vb);
+       }
+}
+
+/*
+ * pmap_page_protect: change the protection of all recorded mappings
+ *     of a managed page
+ *
+ * => this function is a frontend for pmap_page_remove/pmap_clear_attrs
+ * => we only have to worry about making the page more protected.
+ *     unprotecting a page is done on-demand at fault time.
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
+{
+       if ((prot & VM_PROT_WRITE) == 0) {
+               if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
+                       (void) pmap_clear_attrs(pg, PG_RW);
+               } else {
+                       pmap_page_remove(pg);
+               }
+       }
+}
+
+/*
+ * pmap_protect: change the protection of pages in a pmap
+ *
+ * => this function is a frontend for pmap_remove/pmap_write_protect
+ * => we only have to worry about making the page more protected.
+ *     unprotecting a page is done on-demand at fault time.
+ */
+
+__inline static void __attribute__((__unused__))
+pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
+{
+       if ((prot & VM_PROT_WRITE) == 0) {
+               if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
+                       pmap_write_protect(pmap, sva, eva, prot);
+               } else {
+                       pmap_remove(pmap, sva, eva);
+               }
+       }
+}
+
+/*
+ * various address inlines
+ *
+ *  vtopte: return a pointer to the PTE mapping a VA, works only for
+ *  user and PT addresses
+ *
+ *  kvtopte: return a pointer to the PTE mapping a kernel VA
+ */
+
+#include <lib/libkern/libkern.h>
+
+static __inline pt_entry_t * __attribute__((__unused__))
+vtopte(vaddr_t va)
+{
+
+       KASSERT(va < (PDSLOT_KERN << PDSHIFT));
+
+       return (PTE_BASE + x86_btop(va));
+}
+
+static __inline pt_entry_t * __attribute__((__unused__))
+kvtopte(vaddr_t va)
+{
+
+       KASSERT(va >= (PDSLOT_KERN << PDSHIFT));
+
+#ifdef LARGEPAGES
+       {
+               pd_entry_t *pde;
+
+               pde = PDP_BASE + pdei(va);
+               if (*pde & PG_PS)
+                       return ((pt_entry_t *)pde);
+       }
+#endif
+
+       return (PTE_BASE + x86_btop(va));
+}
+
+/*
+ * vtomach: virtual address to machine address.  For use by
+ * machine-dependent code only.
+ */
+
+static inline paddr_t __attribute__((__unused__))
+vtomach(vaddr_t va)
+{
+       pt_entry_t pte;
+
+       pte = PTE_GET(&PTE_BASE[x86_btop(va)]);
+       return xpmap_ptom((pte & PG_FRAME) | (va & ~PG_FRAME));
+}
+
+#define pmap_cpu_has_pg_n()            (cpu_class != CPUCLASS_386)
+#define pmap_cpu_has_invlpg()          (cpu_class != CPUCLASS_386)
+
+paddr_t vtophys(vaddr_t);
+vaddr_t        pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t);
+
+void   pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t);
+
+#if defined(USER_LDT)
+void   pmap_ldt_cleanup(struct lwp *);
+#define        PMAP_FORK
+#endif /* USER_LDT */
+
+/* 
+ * Hooks for the pool allocator.
+ */
+#define        POOL_VTOPHYS(va)        vtophys((vaddr_t) (va))
+
+#endif /* _KERNEL */
+#endif /* _I386_PMAP_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xen.h
new file mode 100644 (file)
index 0000000..48bff48
--- /dev/null
@@ -0,0 +1,247 @@
+/*     $NetBSD: xen.h,v 1.1.2.2 2004/06/17 09:23:19 tron Exp $ */
+
+/*
+ *
+ * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team)
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef _XEN_H
+#define _XEN_H
+
+#ifndef _LOCORE
+
+struct xen_netinfo {
+       uint32_t xi_ifno;
+       char *xi_root;
+       uint32_t xi_ip[5];
+};
+
+union xen_cmdline_parseinfo {
+       char                    xcp_bootdev[16]; /* sizeof(dv_xname) */
+       struct xen_netinfo      xcp_netinfo;
+       char                    xcp_console[16];
+};
+
+#define        XEN_PARSE_BOOTDEV       0
+#define        XEN_PARSE_NETINFO       1
+#define        XEN_PARSE_CONSOLE       2
+
+void   xen_parse_cmdline(int, union xen_cmdline_parseinfo *);
+
+void   xenconscn_attach(void);
+
+void   xenmachmem_init(void);
+void   xenprivcmd_init(void);
+void   xenvfr_init(void);
+
+#ifdef XENDEBUG
+void printk(const char *, ...);
+void vprintk(const char *, va_list);
+#endif
+
+#endif
+
+#endif /* _XEN_H */
+
+/******************************************************************************
+ * os.h
+ * 
+ * a random collection of macros and definitions
+ */
+
+#ifndef _OS_H_
+#define _OS_H_
+
+/*
+ * These are the segment descriptors provided for us by the hypervisor.
+ * For now, these are hardwired -- guest OSes cannot update the GDT
+ * or LDT.
+ * 
+ * It shouldn't be hard to support descriptor-table frobbing -- let me 
+ * know if the BSD or XP ports require flexibility here.
+ */
+
+
+/*
+ * these are also defined in hypervisor-if.h but can't be pulled in from
+ * there, as they are used in start-of-day assembly.  the .h files need
+ * a bit more cleanup...
+ */
+
+#ifndef FLAT_RING1_CS
+#define FLAT_RING1_CS          0x0819
+#define FLAT_RING1_DS          0x0821
+#define FLAT_RING3_CS          0x082b
+#define FLAT_RING3_DS          0x0833
+#endif
+
+#define __KERNEL_CS        FLAT_RING1_CS
+#define __KERNEL_DS        FLAT_RING1_DS
+
+/* Everything below this point is not included by assembler (.S) files. */
+#ifndef _LOCORE
+
+/* some function prototypes */
+void trap_init(void);
+
+
+/*
+ * STI/CLI equivalents. These basically set and clear the virtual
+ * event_enable flag in the shared_info structure. Note that when
+ * the enable bit is set, there may be pending events to be handled.
+ * We may therefore call into do_hypervisor_callback() directly.
+ */
+
+#define __save_flags(x)                                                        \
+do {                                                                   \
+       (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask;  \
+} while (0)
+
+#define __restore_flags(x)                                             \
+do {                                                                   \
+       shared_info_t *_shared = HYPERVISOR_shared_info;                \
+       __insn_barrier();                                               \
+       if ((_shared->vcpu_data[0].evtchn_upcall_mask = (x)) == 0) {    \
+               __insn_barrier();                                       \
+               if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \
+                       hypervisor_force_callback();                    \
+       }                                                               \
+} while (0)
+
+#define __cli()                                                                \
+do {                                                                   \
+       HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1;    \
+       __insn_barrier();                                               \
+} while (0)
+
+#define __sti()                                                                \
+do {                                                                   \
+       shared_info_t *_shared = HYPERVISOR_shared_info;                \
+       __insn_barrier();                                               \
+       _shared->vcpu_data[0].evtchn_upcall_mask = 0;                   \
+       __insn_barrier(); /* unmask then check (avoid races) */         \
+       if (__predict_false(_shared->vcpu_data[0].evtchn_upcall_pending)) \
+               hypervisor_force_callback();                            \
+} while (0)
+
+#define cli()                  __cli()
+#define sti()                  __sti()
+#define save_flags(x)          __save_flags(x)
+#define restore_flags(x)       __restore_flags(x)
+#define save_and_cli(x)        do {                                    \
+       __save_flags(x);                                        \
+       __cli();                                                \
+} while (/* CONSTCOND */ 0)
+#define save_and_sti(x)                do {                                    \
+       __save_flags(x);                                        \
+       __sti();                                                \
+} while (/* CONSTCOND */ 0)
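+
+/*
+ * Typical critical section built from the above (an illustrative
+ * sketch only):
+ *
+ *     int s;
+ *
+ *     save_and_cli(s);
+ *     ... modify state shared with the event upcall ...
+ *     restore_flags(s);
+ */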
+
+#ifdef MULTIPROCESSOR
+#define __LOCK_PREFIX "lock; "
+#else
+#define __LOCK_PREFIX ""
+#endif
+
+static __inline__ uint32_t
+x86_atomic_xchg(uint32_t *ptr, uint32_t val)
+{
+       uint32_t result;
+
+        __asm __volatile("xchgl %0,%1"
+           :"=r" (result)
+           :"m" (*ptr), "0" (val)
+           :"memory");
+
+       return result;
+}
+
+static __inline__ int
+x86_atomic_test_and_clear_bit(volatile void *ptr, int bitno)
+{
+        int result;
+
+        __asm __volatile(__LOCK_PREFIX
+           "btrl %2,%1 ;"
+           "sbbl %0,%0"
+           :"=r" (result), "=m" (*(volatile uint32_t *)(ptr))
+           :"Ir" (bitno) : "memory");
+        return result;
+}
+
+static __inline__ int
+x86_atomic_test_and_set_bit(volatile void *ptr, int bitno)
+{
+        int result;
+
+        __asm __volatile(__LOCK_PREFIX
+           "btsl %2,%1 ;"
+           "sbbl %0,%0"
+           :"=r" (result), "=m" (*(volatile uint32_t *)(ptr))
+           :"Ir" (bitno) : "memory");
+        return result;
+}
+
+static __inline int
+x86_constant_test_bit(const volatile void *ptr, int bitno)
+{
+       return ((1UL << (bitno & 31)) &
+           (((const volatile uint32_t *) ptr)[bitno >> 5])) != 0;
+}
+
+static __inline int
+x86_variable_test_bit(const volatile void *ptr, int bitno)
+{
+       int result;
+    
+       __asm __volatile(
+               "btl %2,%1 ;"
+               "sbbl %0,%0"
+               :"=r" (result)
+               :"m" (*(volatile uint32_t *)(ptr)), "Ir" (bitno));
+       return result;
+}
+
+#define x86_atomic_test_bit(ptr, bitno) \
+       (__builtin_constant_p(bitno) ? \
+        x86_constant_test_bit((ptr),(bitno)) : \
+        x86_variable_test_bit((ptr),(bitno)))
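+
+/*
+ * With a compile-time constant bit number the plain C variant is
+ * selected, otherwise the btl-based one, e.g. (sketch, hypothetical
+ * flags word):
+ *
+ *     if (x86_atomic_test_bit(&flags, SOME_BIT))
+ *             handle_it();
+ */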
+
+static __inline void
+x86_atomic_set_bit(volatile void *ptr, int bitno)
+{
+        __asm __volatile(__LOCK_PREFIX
+           "btsl %1,%0"
+           :"=m" (*(volatile uint32_t *)(ptr))
+           :"Ir" (bitno));
+}
+
+static __inline void
+x86_atomic_clear_bit(volatile void *ptr, int bitno)
+{
+        __asm __volatile(__LOCK_PREFIX
+           "btrl %1,%0"
+           :"=m" (*(volatile uint32_t *)(ptr))
+           :"Ir" (bitno));
+}
+
+#endif /* !_LOCORE */
+
+#endif /* _OS_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenfunc.h
new file mode 100644 (file)
index 0000000..2df026a
--- /dev/null
@@ -0,0 +1,135 @@
+/*     $NetBSD: xenfunc.h,v 1.1.2.1 2004/05/22 15:59:31 he Exp $       */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENFUNC_H_
+#define _XEN_XENFUNC_H_
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/xenpmap.h>
+#include <machine/pte.h>
+
+#ifdef XENDEBUG_LOW
+#define        __PRINTK(x) printk x
+#else
+#define        __PRINTK(x)
+#endif
+
+void xen_set_ldt(vaddr_t, uint32_t);
+void xen_update_descriptor(union descriptor *, union descriptor *);
+
+static __inline void 
+invlpg(u_int addr)
+{
+       xpq_queue_invlpg(addr);
+       xpq_flush_queue();
+}  
+
+static __inline void
+lldt(u_short sel)
+{
+
+       /* __PRINTK(("ldt %x\n", IDXSELN(sel))); */
+       if (sel == GSEL(GLDT_SEL, SEL_KPL))
+               xen_set_ldt((vaddr_t)ldt, NLDT);
+       else
+               xen_set_ldt(cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_base,
+                   cpu_info_primary.ci_gdt[IDXSELN(sel)].ld.ld_entries);
+}
+
+static __inline void
+ltr(u_short sel)
+{
+       __PRINTK(("XXX ltr not supported\n"));
+}
+
+static __inline void
+lcr0(u_int val)
+{
+       __PRINTK(("XXX lcr0 not supported\n"));
+}
+
+static __inline u_int
+rcr0(void)
+{
+       __PRINTK(("XXX rcr0 not supported\n"));
+       return 0;
+}
+
+#define lcr3(_v) _lcr3((_v), __FILE__, __LINE__)
+static __inline void
+_lcr3(u_int val, char *file, int line)
+{
+/*     __PRINTK(("lcr3 %08x at %s:%d\n", val, file, line)); */
+       xpq_queue_pt_switch(xpmap_ptom(val) & PG_FRAME);
+       xpq_flush_queue();
+}
+
+static __inline void
+tlbflush(void)
+{
+       xpq_queue_tlb_flush();
+       xpq_flush_queue();
+}
+
+static __inline u_int
+rdr6(void)
+{
+       u_int val;
+
+       val = HYPERVISOR_get_debugreg(6);
+       return val;
+}
+
+static __inline void
+ldr6(u_int val)
+{
+
+       HYPERVISOR_set_debugreg(6, val);
+}
+
+static __inline void
+disable_intr(void)
+{
+       __cli();
+}
+
+static __inline void
+enable_intr(void)
+{
+       __sti();
+}
+
+#endif /* _XEN_XENFUNC_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h b/netbsd-2.0-xen-sparse/sys/arch/xen/include/xenpmap.h
new file mode 100644 (file)
index 0000000..f3c8c7f
--- /dev/null
@@ -0,0 +1,193 @@
+/*     $NetBSD: xenpmap.h,v 1.1.2.1 2004/05/22 15:59:58 he Exp $       */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _XEN_XENPMAP_H_
+#define _XEN_XENPMAP_H_
+
+#define        INVALID_P2M_ENTRY       (~0UL)
+
+void xpq_queue_invlpg(vaddr_t);
+void xpq_queue_pde_update(pd_entry_t *, pd_entry_t);
+void xpq_queue_pte_update(pt_entry_t *, pt_entry_t);
+void xpq_queue_unchecked_pte_update(pt_entry_t *, pt_entry_t);
+void xpq_queue_pt_switch(paddr_t);
+void xpq_flush_queue(void);
+void xpq_queue_set_ldt(vaddr_t, uint32_t);
+void xpq_queue_tlb_flush(void);
+void xpq_queue_pin_table(paddr_t, int);
+void xpq_queue_unpin_table(paddr_t);
+
+extern paddr_t *xpmap_phys_to_machine_mapping;
+
+#define        XPQ_PIN_L1_TABLE 1
+#define        XPQ_PIN_L2_TABLE 2
+
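+/*
+ * Two flavours of the page-table entry accessors follow.  The !XEN
+ * versions write entries directly; the XEN versions funnel every update
+ * through the hypervisor request queue and flush it immediately.  In the
+ * Xen case _pdp/_ptp is the virtual address of the entry, used for
+ * reads, while _mapdp/_maptp carries its machine address (cf. vtomach()
+ * in the callers), which is what the update queue expects.
+ */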
+#ifndef XEN
+#define        PDE_GET(_pdp)                                           \
+       *(_pdp)
+#define PDE_SET(_pdp,_mapdp,_npde)                             \
+       *(_mapdp) = (_npde)
+#define PDE_CLEAR(_pdp,_mapdp)                                 \
+       *(_mapdp) = 0
+#define PTE_SET(_ptp,_maptp,_npte)                             \
+       *(_maptp) = (_npte)
+#define PTE_CLEAR(_ptp,_maptp)                                 \
+       *(_maptp) = 0
+#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte)                        \
+       (_opte) = x86_atomic_testset_ul((_maptp), (_npte))
+#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte)                    \
+       (_opte) = x86_atomic_testset_ul((_maptp), 0)
+#define PDE_CLEARBITS(_pdp,_mapdp,_bits)                       \
+       *(_mapdp) &= ~(_bits)
+#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits)                        \
+       x86_atomic_clearbits_l((_maptp), (_bits))
+#define PTE_SETBITS(_ptp,_maptp,_bits)                         \
+       *(_maptp) |= (_bits)
+#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits)                  \
+       x86_atomic_setbits_l((_maptp), (_bits))
+#else
+
+#define        PDE_GET(_pdp)                                           \
+       (pmap_valid_entry(*(_pdp)) ? xpmap_mtop(*(_pdp)) : *(_pdp))
+#define PDE_SET(_pdp,_mapdp,_npde) do {                                \
+       xpq_queue_pde_update((_mapdp), xpmap_ptom((_npde)));    \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PDE_CLEAR(_pdp,_mapdp) do {                            \
+       xpq_queue_pde_update((_mapdp), 0);                      \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define        PTE_GET(_ptp)                                           \
+       (pmap_valid_entry(*(_ptp)) ? xpmap_mtop(*(_ptp)) : *(_ptp))
+#define        PTE_GET_MA(_ptp)                                        \
+       *(_ptp)
+#define PTE_SET(_ptp,_maptp,_npte) do {                                \
+       xpq_queue_pte_update((_maptp), xpmap_ptom((_npte)));    \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_SET_MA(_ptp,_maptp,_npte) do {                     \
+       xpq_queue_pte_update((_maptp), (_npte));                \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_SET_MA_UNCHECKED(_ptp,_maptp,_npte) do {           \
+       xpq_queue_unchecked_pte_update((_maptp), (_npte));      \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_CLEAR(_ptp,_maptp) do {                            \
+       xpq_queue_pte_update((_maptp), 0);                      \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SET(_ptp,_maptp,_npte,_opte) do {           \
+       (_opte) = PTE_GET(_ptp);                                \
+       xpq_queue_pte_update((_maptp), xpmap_ptom((_npte)));    \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SET_MA(_ptp,_maptp,_npte,_opte) do {                \
+       (_opte) = *(_ptp);                                      \
+       xpq_queue_pte_update((_maptp), (_npte));                \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEAR(_ptp,_maptp,_opte) do {               \
+       (_opte) = PTE_GET(_ptp);                                \
+       xpq_queue_pte_update((_maptp), 0);                      \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEAR_MA(_ptp,_maptp,_opte) do {            \
+       (_opte) = *(_ptp);                                      \
+       xpq_queue_pte_update((_maptp), 0);                      \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PDE_CLEARBITS(_pdp,_mapdp,_bits) do {                  \
+       xpq_queue_pte_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_CLEARBITS(_ptp,_maptp,_bits) do {                  \
+       xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PDE_ATOMIC_CLEARBITS(_pdp,_mapdp,_bits) do {           \
+       xpq_queue_pde_update((_mapdp), *(_pdp) & ~((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_CLEARBITS(_ptp,_maptp,_bits) do {           \
+       xpq_queue_pte_update((_maptp), *(_ptp) & ~((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_SETBITS(_ptp,_maptp,_bits) do {                    \
+       xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PDE_ATOMIC_SETBITS(_pdp,_mapdp,_bits) do {             \
+       xpq_queue_pde_update((_mapdp), *(_pdp) | ((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PTE_ATOMIC_SETBITS(_ptp,_maptp,_bits) do {             \
+       xpq_queue_pte_update((_maptp), *(_ptp) | ((_bits) & ~PG_FRAME)); \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define PDE_COPY(_dpdp,_madpdp,_spdp) do {                     \
+       xpq_queue_pde_update((_madpdp), *(_spdp));              \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+#define        PTE_UPDATES_FLUSH() do {                                \
+       xpq_flush_queue();                                      \
+} while (/*CONSTCOND*/0)
+
+#endif
+
+#define        XPMAP_OFFSET    (KERNTEXTOFF - KERNBASE_LOCORE)
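+/*
+ * Translations between the guest's pseudo-physical address space and
+ * real machine addresses.  xpmap_mtop() uses the hypervisor-provided
+ * machine_to_phys_mapping table; xpmap_ptom() uses this domain's
+ * phys-to-machine table.  Both translate only the frame number and
+ * carry the low flag bits (~PG_FRAME) across unchanged, which is why
+ * the _masked variant exists for callers that want a bare frame.
+ */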
+static __inline paddr_t
+xpmap_mtop(paddr_t mpa)
+{
+       return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) +
+           XPMAP_OFFSET) | (mpa & ~PG_FRAME);
+}
+
+static __inline paddr_t
+xpmap_ptom(paddr_t ppa)
+{
+       return (xpmap_phys_to_machine_mapping[(ppa -
+           XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT)
+               | (ppa & ~PG_FRAME);
+}
+
+static __inline paddr_t
+xpmap_ptom_masked(paddr_t ppa)
+{
+       return (xpmap_phys_to_machine_mapping[(ppa -
+           XPMAP_OFFSET) >> PAGE_SHIFT] << PAGE_SHIFT);
+}
+
+#endif /* _XEN_XENPMAP_H_ */
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c b/netbsd-2.0-xen-sparse/sys/arch/xen/x86/bus_space.c
new file mode 100644 (file)
index 0000000..dda715f
--- /dev/null
@@ -0,0 +1,505 @@
+/*     $NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $     */
+/*     NetBSD: bus_space.c,v 1.2 2003/03/14 18:47:53 christos Exp      */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
+ * Simulation Facility, NASA Ames Research Center.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *     This product includes software developed by the NetBSD
+ *     Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: bus_space.c,v 1.2.2.1 2004/05/22 15:57:25 he Exp $");
+
+#include "opt_xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/extent.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/bus.h>
+
+#include <dev/isa/isareg.h>
+#include <machine/isa_machdep.h>
+
+#include <machine/hypervisor.h>
+#include <machine/xenpmap.h>
+
+/*
+ * Extent maps to manage I/O and memory space.  Allocate
+ * storage for 8 regions in each, initially.  Later, ioport_malloc_safe
+ * will indicate that it's safe to use malloc() to dynamically allocate
+ * region descriptors.
+ *
+ * N.B. At least two regions are _always_ allocated from the iomem
+ * extent map; (0 -> ISA hole) and (end of ISA hole -> end of RAM).
+ *
+ * The extent maps are not static!  Machine-dependent ISA and EISA
+ * routines need access to them for bus address space allocation.
+ */
+static long ioport_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)];
+static long iomem_ex_storage[EXTENT_FIXED_STORAGE_SIZE(8) / sizeof(long)];
+struct extent *ioport_ex;
+struct extent *iomem_ex;
+static int ioport_malloc_safe;
+
+int    x86_mem_add_mapping __P((bus_addr_t, bus_size_t,
+           int, bus_space_handle_t *));
+
+void
+x86_bus_space_init()
+{
+       /*
+        * Initialize the I/O port and I/O mem extent maps.
+        * Note: we don't have to check the return value since
+        * creation of a fixed extent map will never fail (since
+        * descriptor storage has already been allocated).
+        *
+        * N.B. The iomem extent manages _all_ physical addresses
+        * on the machine.  When the amount of RAM is found, the two
+        * extents of RAM are allocated from the map (0 -> ISA hole
+        * and end of ISA hole -> end of RAM).
+        */
+       ioport_ex = extent_create("ioport", 0x0, 0xffff, M_DEVBUF,
+           (caddr_t)ioport_ex_storage, sizeof(ioport_ex_storage),
+           EX_NOCOALESCE|EX_NOWAIT);
+       iomem_ex = extent_create("iomem", 0x0, 0xffffffff, M_DEVBUF,
+           (caddr_t)iomem_ex_storage, sizeof(iomem_ex_storage),
+           EX_NOCOALESCE|EX_NOWAIT);
+
+       /* We are a privileged guest OS, so we should have I/O privileges. */
+       if (xen_start_info.flags & SIF_PRIVILEGED) {
+               dom0_op_t op;
+               op.cmd = DOM0_IOPL;
+               op.u.iopl.domain = DOMID_SELF;
+               op.u.iopl.iopl = 1;
+               if (HYPERVISOR_dom0_op(&op) != 0)
+                       panic("Unable to obtain IOPL, "
+                           "despite being SIF_PRIVILEGED");
+       }
+}
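+/*
+ * A hedged sketch of how the extent maps get used (the real work is in
+ * x86_memio_map()/x86_memio_unmap() below): a caller reserving, say, an
+ * 8-port range at 0x3f8 would in effect perform
+ *
+ *     error = extent_alloc_region(ioport_ex, 0x3f8, 8, EX_NOWAIT);
+ *     ...
+ *     extent_free(ioport_ex, 0x3f8, 8, EX_NOWAIT);
+ *
+ * and a nonzero return from extent_alloc_region() means the range is
+ * already claimed.
+ */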
+
+void
+x86_bus_space_mallocok()
+{
+
+       ioport_malloc_safe = 1;
+}
+
+int
+x86_memio_map(t, bpa, size, flags, bshp)
+       bus_space_tag_t t;
+       bus_addr_t bpa;
+       bus_size_t size;
+       int flags;
+       bus_space_handle_t *bshp;
+{
+       int error;
+       struct extent *ex;
+
+       /*
+        * Pick the appropriate extent map.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               if (flags & BUS_SPACE_MAP_LINEAR)
+                       return (EOPNOTSUPP);
+               ex = ioport_ex;
+       } else if (t == X86_BUS_SPACE_MEM)
+               ex = iomem_ex;
+       else
+               panic("x86_memio_map: bad bus space tag");
+
+       /*
+        * Before we go any further, let's make sure that this
+        * region is available.
+        */
+       error = extent_alloc_region(ex, bpa, size,
+           EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0));
+       if (error)
+               return (error);
+
+       /*
+        * For I/O space, that's all she wrote.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               *bshp = bpa;
+               return (0);
+       }
+
+       /*
+        * For memory space, map the bus physical address to
+        * a kernel virtual address.
+        */
+       error = x86_mem_add_mapping(bpa, size,
+               (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp);
+       if (error) {
+               if (extent_free(ex, bpa, size, EX_NOWAIT |
+                   (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+                       printf("x86_memio_map: pa 0x%lx, size 0x%lx\n",
+                           bpa, size);
+                       printf("x86_memio_map: can't free region\n");
+               }
+       }
+
+       return (error);
+}
+
+int
+_x86_memio_map(t, bpa, size, flags, bshp)
+       bus_space_tag_t t;
+       bus_addr_t bpa;
+       bus_size_t size;
+       int flags;
+       bus_space_handle_t *bshp;
+{
+
+       /*
+        * For I/O space, just fill in the handle.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               if (flags & BUS_SPACE_MAP_LINEAR)
+                       return (EOPNOTSUPP);
+               *bshp = bpa;
+               return (0);
+       }
+
+       /*
+        * For memory space, map the bus physical address to
+        * a kernel virtual address.
+        */
+       return (x86_mem_add_mapping(bpa, size,
+           (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp));
+}
+
+int
+x86_memio_alloc(t, rstart, rend, size, alignment, boundary, flags,
+    bpap, bshp)
+       bus_space_tag_t t;
+       bus_addr_t rstart, rend;
+       bus_size_t size, alignment, boundary;
+       int flags;
+       bus_addr_t *bpap;
+       bus_space_handle_t *bshp;
+{
+       struct extent *ex;
+       u_long bpa;
+       int error;
+
+       /*
+        * Pick the appropriate extent map.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               if (flags & BUS_SPACE_MAP_LINEAR)
+                       return (EOPNOTSUPP);
+               ex = ioport_ex;
+       } else if (t == X86_BUS_SPACE_MEM)
+               ex = iomem_ex;
+       else
+               panic("x86_memio_alloc: bad bus space tag");
+
+       /*
+        * Sanity check the allocation against the extent's boundaries.
+        */
+       if (rstart < ex->ex_start || rend > ex->ex_end)
+               panic("x86_memio_alloc: bad region start/end");
+
+       /*
+        * Do the requested allocation.
+        */
+       error = extent_alloc_subregion(ex, rstart, rend, size, alignment,
+           boundary,
+           EX_FAST | EX_NOWAIT | (ioport_malloc_safe ?  EX_MALLOCOK : 0),
+           &bpa);
+
+       if (error)
+               return (error);
+
+       /*
+        * For I/O space, that's all she wrote.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               *bshp = *bpap = bpa;
+               return (0);
+       }
+
+       /*
+        * For memory space, map the bus physical address to
+        * a kernel virtual address.
+        */
+       error = x86_mem_add_mapping(bpa, size,
+           (flags & BUS_SPACE_MAP_CACHEABLE) != 0, bshp);
+       if (error) {
+               if (extent_free(iomem_ex, bpa, size, EX_NOWAIT |
+                   (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+                       printf("x86_memio_alloc: pa 0x%lx, size 0x%lx\n",
+                           bpa, size);
+                       printf("x86_memio_alloc: can't free region\n");
+               }
+       }
+
+       *bpap = bpa;
+
+       return (error);
+}
+
+int
+x86_mem_add_mapping(bpa, size, cacheable, bshp)
+       bus_addr_t bpa;
+       bus_size_t size;
+       int cacheable;
+       bus_space_handle_t *bshp;
+{
+       u_long pa, endpa;
+       vaddr_t va;
+       pt_entry_t *pte;
+       pt_entry_t *maptp;
+       int32_t cpumask = 0;
+
+       pa = x86_trunc_page(bpa);
+       endpa = x86_round_page(bpa + size);
+
+#ifdef DIAGNOSTIC
+       if (endpa <= pa)
+               panic("x86_mem_add_mapping: overflow");
+#endif
+
+       if (bpa >= IOM_BEGIN && (bpa + size) <= IOM_END) {
+               va = (vaddr_t)ISA_HOLE_VADDR(pa);
+       } else {
+               va = uvm_km_valloc(kernel_map, endpa - pa);
+               if (va == 0)
+                       return (ENOMEM);
+       }
+
+       *bshp = (bus_space_handle_t)(va + (bpa & PGOFSET));
+
+       for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) {
+               pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE);
+
+               /*
+                * PG_N doesn't exist on 386's, so we assume that
+                * the mainboard has wired up device space non-cacheable
+                * on those machines.
+                *
+                * Note that it's not necessary to use atomic ops to
+                * fiddle with the PTE here, because we don't care
+                * about mod/ref information.
+                *
+                * XXX should hand this bit to pmap_kenter_pa to
+                * save the extra invalidate!
+                *
+                * XXX extreme paranoia suggests tlb shootdown belongs here.
+                */
+               if (pmap_cpu_has_pg_n()) {
+                       pte = kvtopte(va);
+                       maptp = (pt_entry_t *)vtomach((vaddr_t)pte);
+                       if (cacheable)
+                               PTE_CLEARBITS(pte, maptp, PG_N);
+                       else
+                               PTE_SETBITS(pte, maptp, PG_N);
+                       pmap_tlb_shootdown(pmap_kernel(), va, *pte,
+                           &cpumask);
+               }
+       }
+
+       pmap_tlb_shootnow(cpumask);
+       pmap_update(pmap_kernel());
+
+       return 0;
+}
+
+/*
+ * void _x86_memio_unmap(bus_space_tag_t bst, bus_space_handle_t bsh,
+ *                        bus_size_t size, bus_addr_t *adrp)
+ *
+ *   This function unmaps memory- or I/O-space mapped by
+ *   _x86_memio_map().  It works much like x86_memio_unmap(), but it
+ *   does not consult the kernel's built-in extent maps, and it hands
+ *   back the physical address of the bus space for the convenience of
+ *   an external extent manager.
+ */
+void
+_x86_memio_unmap(t, bsh, size, adrp)
+       bus_space_tag_t t;
+       bus_space_handle_t bsh;
+       bus_size_t size;
+       bus_addr_t *adrp;
+{
+       u_long va, endva;
+       bus_addr_t bpa;
+
+       /*
+        * Find the correct extent and bus physical address.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               bpa = bsh;
+       } else if (t == X86_BUS_SPACE_MEM) {
+               if (bsh >= atdevbase && (bsh + size) <= (atdevbase + IOM_SIZE)) {
+                       bpa = (bus_addr_t)ISA_PHYSADDR(bsh);
+               } else {
+
+                       va = x86_trunc_page(bsh);
+                       endva = x86_round_page(bsh + size);
+
+#ifdef DIAGNOSTIC
+                       if (endva <= va) {
+                               panic("_x86_memio_unmap: overflow");
+                       }
+#endif
+
+#if __NetBSD_Version__ > 104050000
+                       if (pmap_extract(pmap_kernel(), va, &bpa) == FALSE) {
+                               panic("_x86_memio_unmap:"
+                                   " wrong virtual address");
+                       }
+                       bpa += (bsh & PGOFSET);
+#else
+                       bpa = pmap_extract(pmap_kernel(), va) + (bsh & PGOFSET);
+#endif
+
+                       pmap_kremove(va, endva - va);
+                       /*
+                        * Free the kernel virtual mapping.
+                        */
+                       uvm_km_free(kernel_map, va, endva - va);
+               }
+       } else {
+               panic("_x86_memio_unmap: bad bus space tag");
+       }
+
+       if (adrp != NULL) {
+               *adrp = bpa;
+       }
+}
+
+void
+x86_memio_unmap(t, bsh, size)
+       bus_space_tag_t t;
+       bus_space_handle_t bsh;
+       bus_size_t size;
+{
+       struct extent *ex;
+       u_long va, endva;
+       bus_addr_t bpa;
+
+       /*
+        * Find the correct extent and bus physical address.
+        */
+       if (t == X86_BUS_SPACE_IO) {
+               ex = ioport_ex;
+               bpa = bsh;
+       } else if (t == X86_BUS_SPACE_MEM) {
+               ex = iomem_ex;
+
+               if (bsh >= atdevbase &&
+                   (bsh + size) <= (atdevbase + IOM_SIZE)) {
+                       bpa = (bus_addr_t)ISA_PHYSADDR(bsh);
+                       goto ok;
+               }
+
+               va = x86_trunc_page(bsh);
+               endva = x86_round_page(bsh + size);
+
+#ifdef DIAGNOSTIC
+               if (endva <= va)
+                       panic("x86_memio_unmap: overflow");
+#endif
+
+               (void) pmap_extract(pmap_kernel(), va, &bpa);
+               bpa += (bsh & PGOFSET);
+
+               pmap_kremove(va, endva - va);
+               /*
+                * Free the kernel virtual mapping.
+                */
+               uvm_km_free(kernel_map, va, endva - va);
+       } else
+               panic("x86_memio_unmap: bad bus space tag");
+
+ok:
+       if (extent_free(ex, bpa, size,
+           EX_NOWAIT | (ioport_malloc_safe ? EX_MALLOCOK : 0))) {
+               printf("x86_memio_unmap: %s 0x%lx, size 0x%lx\n",
+                   (t == X86_BUS_SPACE_IO) ? "port" : "pa", bpa, size);
+               printf("x86_memio_unmap: can't free region\n");
+       }
+}
+
+void
+x86_memio_free(t, bsh, size)
+       bus_space_tag_t t;
+       bus_space_handle_t bsh;
+       bus_size_t size;
+{
+
+       /* x86_memio_unmap() does all that we need to do. */
+       x86_memio_unmap(t, bsh, size);
+}
+
+int
+x86_memio_subregion(t, bsh, offset, size, nbshp)
+       bus_space_tag_t t;
+       bus_space_handle_t bsh;
+       bus_size_t offset, size;
+       bus_space_handle_t *nbshp;
+{
+
+       *nbshp = bsh + offset;
+       return (0);
+}
+
+paddr_t
+x86_memio_mmap(t, addr, off, prot, flags)
+       bus_space_tag_t t;
+       bus_addr_t addr;
+       off_t off;
+       int prot;
+       int flags;
+{
+
+       /* Can't mmap I/O space. */
+       if (t == X86_BUS_SPACE_IO)
+               return (-1);
+
+       /*
+        * "addr" is the base address of the device we're mapping.
+        * "off" is the offset into that device.
+        *
+        * Note we are called for each "page" in the device that
+        * the upper layers want to map.
+        */
+       return (x86_btop(addr + off));
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/clock.c
new file mode 100644 (file)
index 0000000..6783f69
--- /dev/null
@@ -0,0 +1,234 @@
+/*     $NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_xen.h"
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: clock.c,v 1.1.2.2 2004/07/17 16:43:56 he Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/device.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/cpu_counter.h>
+
+#include <dev/clock_subr.h>
+
+#include "config_time.h"               /* for CONFIG_TIME */
+
+static int xen_timer_handler(void *, struct trapframe *);
+
+/* These are periodically updated in shared_info, and then copied here. */
+static unsigned long shadow_tsc_stamp;
+static u_int64_t shadow_system_time;
+static unsigned long shadow_time_version;
+static struct timeval shadow_tv;
+
+static int timeset;
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.  Must be called at splclock.
+ */
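+/*
+ * Xen bumps time_version1 before it updates the shared time fields and
+ * time_version2 afterwards, so the two match only while the fields are
+ * stable.  Copying time_version2 first and re-checking time_version1
+ * last therefore makes us retry whenever an update raced with the copy.
+ */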
+static void
+get_time_values_from_xen(void)
+{
+       do {
+               shadow_time_version = HYPERVISOR_shared_info->time_version2;
+               __insn_barrier();
+               shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec;
+               shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec;
+               shadow_tsc_stamp = HYPERVISOR_shared_info->tsc_timestamp;
+               shadow_system_time = HYPERVISOR_shared_info->system_time;
+               __insn_barrier();
+       } while (shadow_time_version != HYPERVISOR_shared_info->time_version1);
+}
+
+void
+inittodr(time_t base)
+{
+       int s;
+
+       /*
+        * If the file system time is more than a year older than the
+        * kernel, warn and then set the base time to CONFIG_TIME.
+        */
+       if (base && base < (CONFIG_TIME-SECYR)) {
+               printf("WARNING: preposterous time in file system\n");
+               base = CONFIG_TIME;
+       }
+
+       s = splclock();
+       get_time_values_from_xen();
+       splx(s);
+
+       time.tv_usec = shadow_tv.tv_usec;
+       time.tv_sec = shadow_tv.tv_sec + rtc_offset * 60;
+#ifdef DEBUG_CLOCK
+       printf("readclock: %ld (%ld)\n", time.tv_sec, base);
+#endif
+       if (base != 0 && base < time.tv_sec - 5*SECYR)
+               printf("WARNING: file system time much less than clock time\n");
+       else if (base > time.tv_sec + 5*SECYR) {
+               printf("WARNING: clock time much less than file system time\n");
+               printf("WARNING: using file system time\n");
+               goto fstime;
+       }
+
+       timeset = 1;
+       return;
+
+fstime:
+       timeset = 1;
+       time.tv_sec = base;
+       printf("WARNING: CHECK AND RESET THE DATE!\n");
+}
+
+void
+resettodr()
+{
+#ifdef DOM0OPS
+       dom0_op_t op;
+       int s;
+#endif
+#ifdef DEBUG_CLOCK
+       struct clock_ymdhms dt;
+#endif
+
+       /*
+        * We might have been called by boot() due to a crash early
+        * on.  Don't reset the clock chip in this case.
+        */
+       if (!timeset)
+               return;
+
+#ifdef DEBUG_CLOCK
+       clock_secs_to_ymdhms(time.tv_sec - rtc_offset * 60, &dt);
+
+       printf("setclock: %d/%d/%d %02d:%02d:%02d\n", dt.dt_year,
+           dt.dt_mon, dt.dt_day, dt.dt_hour, dt.dt_min, dt.dt_sec);
+#endif
+#ifdef DOM0OPS
+       if (xen_start_info.dom_id == 0) {
+               s = splclock();
+
+               op.cmd = DOM0_SETTIME;
+               op.u.settime.secs        = time.tv_sec - rtc_offset * 60;
+               op.u.settime.usecs       = time.tv_usec;
+               op.u.settime.system_time = shadow_system_time;
+               HYPERVISOR_dom0_op(&op);
+
+               splx(s);
+       }
+#endif
+}
+
+void
+startrtclock()
+{
+
+}
+
+/*
+ * Wait approximately `n' microseconds.
+ */
+void
+xen_delay(int n)
+{
+       u_int64_t when;
+
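+       /* shadow_system_time counts nanoseconds, so scale microseconds by 1000. */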
+       get_time_values_from_xen();
+       when = shadow_system_time + n * 1000;
+       while (shadow_system_time < when)
+               get_time_values_from_xen();
+}
+
+void
+xen_microtime(struct timeval *tv)
+{
+
+       *tv = time;
+}
+
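+/*
+ * Bind the VIRQ_TIMER virtual IRQ and wire it to xen_timer_handler() at
+ * IPL_CLOCK; Xen then delivers the periodic clock event through the
+ * event-channel machinery rather than a real timer chip.
+ */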
+void
+xen_initclocks()
+{
+       int irq = bind_virq_to_irq(VIRQ_TIMER);
+
+       event_set_handler(irq, (int (*)(void *))xen_timer_handler,
+           NULL, IPL_CLOCK);
+       hypervisor_enable_irq(irq);
+}
+
+static int
+xen_timer_handler(void *arg, struct trapframe *regs)
+{
+#if defined(I586_CPU) || defined(I686_CPU)
+       static int microset_iter; /* call cc_microset once/sec */
+       struct cpu_info *ci = curcpu();
+       
+       /*
+        * If we have a cycle counter, do the microset thing.
+        */
+       if (ci->ci_feature_flags & CPUID_TSC) {
+               if (
+#if defined(MULTIPROCESSOR)
+                   CPU_IS_PRIMARY(ci) &&
+#endif
+                   (microset_iter--) == 0) {
+                       microset_iter = hz - 1;
+#if defined(MULTIPROCESSOR)
+                       x86_broadcast_ipi(X86_IPI_MICROSET);
+#endif
+                       cc_microset_time = time;
+                       cc_microset(ci);
+               }
+       }
+#endif
+
+       get_time_values_from_xen();
+
+       hardclock((struct clockframe *)regs);
+
+       return 0;
+}
+
+void
+setstatclockrate(int arg)
+{
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/hypervisor.c
new file mode 100644 (file)
index 0000000..0f5a9fe
--- /dev/null
@@ -0,0 +1,226 @@
+/* $NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: hypervisor.c,v 1.7.2.1 2004/05/22 15:58:54 he Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+
+#include "xencons.h"
+#include "xennet.h"
+#include "xbd.h"
+#include "xenkbc.h"
+#include "vga_xen.h"
+#include "npx.h"
+
+#include "opt_xen.h"
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+
+#ifdef DOM0OPS
+#include <sys/dirent.h>
+#include <sys/stat.h>
+#include <sys/tree.h>
+#include <sys/vnode.h>
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/kernfs/kernfs.h>
+#include <machine/kernfs_machdep.h>
+#endif
+
+#if NXENNET > 0
+#include <net/if.h>
+#include <net/if_ether.h>
+#include <net/if_media.h>
+#include <machine/if_xennetvar.h>
+#endif
+
+#if NXBD > 0
+#include <sys/buf.h>
+#include <sys/disk.h>
+#include <dev/dkvar.h>
+#include <machine/xbdvar.h>
+#endif
+
+#if NXENKBC > 0
+#include <dev/pckbport/pckbportvar.h>
+#include <machine/xenkbcvar.h>
+#endif
+
+#if NVGA_XEN > 0
+#include <machine/bus.h>
+#include <machine/vga_xenvar.h>
+#endif
+
+int    hypervisor_match(struct device *, struct cfdata *, void *);
+void   hypervisor_attach(struct device *, struct device *, void *);
+
+CFATTACH_DECL(hypervisor, sizeof(struct device),
+    hypervisor_match, hypervisor_attach, NULL, NULL);
+
+int    hypervisor_print(void *, const char *);
+
+union hypervisor_attach_cookie {
+       const char *hac_device;         /* first elem of all */
+#if NXENKBC > 0
+       struct xenkbc_attach_args hac_xenkbc;
+#endif
+#if NVGA_XEN > 0
+       struct xen_vga_attach_args hac_vga_xen;
+#endif
+#if NXENCONS > 0
+       struct xencons_attach_args hac_xencons;
+#endif
+#if NXENNET > 0
+       struct xennet_attach_args hac_xennet;
+#endif
+#if NXBD > 0
+       struct xbd_attach_args hac_xbd;
+#endif
+#if NNPX > 0
+       struct xen_npx_attach_args hac_xennpx;
+#endif
+};
+
+
+/*
+ * Probe for the hypervisor; always succeeds.
+ */
+int
+hypervisor_match(parent, match, aux)
+       struct device *parent;
+       struct cfdata *match;
+       void *aux;
+{
+       struct hypervisor_attach_args *haa = aux;
+
+       if (strcmp(haa->haa_busname, "hypervisor") == 0)
+               return 1;
+       return 0;
+}
+
+/*
+ * Attach the hypervisor.
+ */
+void
+hypervisor_attach(parent, self, aux)
+       struct device *parent, *self;
+       void *aux;
+{
+       union hypervisor_attach_cookie hac;
+
+       printf("\n");
+
+       init_events();
+
+#if NXENKBC > 0
+       hac.hac_xenkbc.xa_device = "xenkbc";
+       config_found(self, &hac.hac_xenkbc, hypervisor_print);
+#endif
+
+#if NVGA_XEN > 0
+       hac.hac_vga_xen.xa_device = "vga_xen";
+       hac.hac_vga_xen.xa_iot = X86_BUS_SPACE_IO;
+       hac.hac_vga_xen.xa_memt = X86_BUS_SPACE_MEM;
+       config_found(self, &hac.hac_vga_xen, hypervisor_print);
+#endif
+
+#if NXENCONS > 0
+       hac.hac_xencons.xa_device = "xencons";
+       config_found(self, &hac.hac_xencons, hypervisor_print);
+#endif
+#if NXENNET > 0
+       hac.hac_xennet.xa_device = "xennet";
+       xennet_scan(self, &hac.hac_xennet, hypervisor_print);
+#endif
+#if NXBD > 0
+       hac.hac_xbd.xa_device = "xbd";
+       xbd_scan(self, &hac.hac_xbd, hypervisor_print);
+#endif
+#if NNPX > 0
+       hac.hac_xennpx.xa_device = "npx";
+       config_found(self, &hac.hac_xennpx, hypervisor_print);
+#endif
+#ifdef DOM0OPS
+       if (xen_start_info.flags & SIF_PRIVILEGED) {
+               xenkernfs_init();
+               xenprivcmd_init();
+               xenmachmem_init();
+               xenvfr_init();
+       }
+#endif
+}
+
+int
+hypervisor_print(aux, parent)
+       void *aux;
+       const char *parent;
+{
+       union hypervisor_attach_cookie *hac = aux;
+
+       if (parent)
+               aprint_normal("%s at %s", hac->hac_device, parent);
+       return (UNCONF);
+}
+
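+/*
+ * Kick the remote end of an event channel: EVTCHNOP_send marks the
+ * peer's port pending, which Xen turns into an upcall on that side.
+ */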
+void
+hypervisor_notify_via_evtchn(unsigned int port)
+{
+       evtchn_op_t op;
+
+       op.cmd = EVTCHNOP_send;
+       op.u.send.local_port = port;
+       (void)HYPERVISOR_event_channel_op(&op);
+}
+
+#ifdef DOM0OPS
+
+#define DIR_MODE       (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
+
+kernfs_parentdir_t *kernxen_pkt;
+
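+/*
+ * Create the "xen" directory in kernfs (normally mounted on /kern);
+ * the dom0 control nodes registered by xenprivcmd_init() and friends
+ * are added underneath it via kernxen_pkt.
+ */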
+void
+xenkernfs_init()
+{
+       kernfs_entry_t *dkt;
+
+       KERNFS_ALLOCENTRY(dkt, M_TEMP, M_WAITOK);
+       KERNFS_INITENTRY(dkt, DT_DIR, "xen", NULL, KFSsubdir, VDIR, DIR_MODE);
+       kernfs_addentry(NULL, dkt);
+       kernxen_pkt = KERNFS_ENTOPARENTDIR(dkt);
+}
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c
new file mode 100644 (file)
index 0000000..51219a9
--- /dev/null
@@ -0,0 +1,1241 @@
+/*     $NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $     */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: if_xennet.c,v 1.1.2.1 2004/05/22 15:58:29 he Exp $");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/syslog.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/device.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#if NRND > 0
+#include <sys/rnd.h>
+#endif
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <net/if_ether.h>
+
+#ifdef mediacode
+#include <net/if_media.h>
+#endif
+
+#ifdef INET
+#include <netinet/in.h>
+#include <netinet/if_inarp.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#endif
+
+#include <nfs/rpcv2.h>
+
+#include <nfs/nfsproto.h>
+#include <nfs/nfs.h>
+#include <nfs/nfsmount.h>
+#include <nfs/nfsdiskless.h>
+
+#include "bpfilter.h"
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#include <net/bpfdesc.h>
+#endif
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/evtchn.h>
+#include <machine/ctrl_if.h>
+
+#include <machine/if_xennetvar.h>
+
+#ifdef DEBUG
+#define XENNET_DEBUG
+#endif
+#if defined(XENNET_DEBUG) && !defined(DEBUG)
+#define DEBUG
+#endif
+/* #define XENNET_DEBUG_DUMP */
+
+#ifdef XENNET_DEBUG
+#define XEDB_FOLLOW    0x01
+#define XEDB_INIT      0x02
+#define XEDB_EVENT     0x04
+#define XEDB_MBUF      0x08
+#define XEDB_MEM       0x10
+int xennet_debug = 0x0;
+#define DPRINTF(x) if (xennet_debug) printf x;
+#define DPRINTFN(n,x) if (xennet_debug & (n)) printf x;
+#else
+#define DPRINTF(x)
+#define DPRINTFN(n,x)
+#endif
+#define PRINTF(x) printf x;
+
+#ifdef XENNET_DEBUG_DUMP
+static void xennet_hex_dump(unsigned char *, size_t, char *, int);
+#endif
+
+int xennet_match (struct device *, struct cfdata *, void *);
+void xennet_attach (struct device *, struct device *, void *);
+static void xennet_ctrlif_rx(ctrl_msg_t *, unsigned long);
+static void xennet_driver_status_change(netif_fe_driver_status_changed_t *);
+static void xennet_status_change(netif_fe_interface_status_changed_t *);
+static void xennet_tx_mbuf_free(struct mbuf *, caddr_t, size_t, void *);
+static void xennet_rx_mbuf_free(struct mbuf *, caddr_t, size_t, void *);
+static int xen_network_handler(void *);
+static void network_tx_buf_gc(struct xennet_softc *);
+static void network_alloc_rx_buffers(struct xennet_softc *);
+static void network_alloc_tx_buffers(struct xennet_softc *);
+void xennet_init(struct xennet_softc *);
+void xennet_reset(struct xennet_softc *);
+#ifdef mediacode
+static int xennet_mediachange (struct ifnet *);
+static void xennet_mediastatus(struct ifnet *, struct ifmediareq *);
+#endif
+
+CFATTACH_DECL(xennet, sizeof(struct xennet_softc),
+    xennet_match, xennet_attach, NULL, NULL);
+
+#define TX_MAX_ENTRIES (NETIF_TX_RING_SIZE - 2)
+#define RX_MAX_ENTRIES (NETIF_RX_RING_SIZE - 2)
+#define TX_ENTRIES 128
+#define RX_ENTRIES 128
+
+static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
+
+/** Network interface info. */
+struct xennet_ctrl {
+       /** Number of interfaces. */
+       int xc_interfaces;
+       /** Number of connected interfaces. */
+       int xc_connected;
+       /** Error code. */
+       int xc_err;
+
+       cfprint_t xc_cfprint;
+       struct device *xc_parent;
+};
+
+static struct xennet_ctrl netctrl = { -1, 0, 0 };
+
+#ifdef mediacode
+static int xennet_media[] = {
+       IFM_ETHER|IFM_AUTO,
+};
+static int nxennet_media = (sizeof(xennet_media)/sizeof(xennet_media[0]));
+#endif
+
+
+int
+xennet_scan(struct device *self, struct xennet_attach_args *xneta,
+    cfprint_t print)
+{
+       ctrl_msg_t cmsg;
+       netif_fe_driver_status_changed_t st;
+       int err = 0;
+
+       if ((xen_start_info.flags & SIF_INITDOMAIN) ||
+           (xen_start_info.flags & SIF_NET_BE_DOMAIN))
+               return 0;
+
+       netctrl.xc_parent = self;
+       netctrl.xc_cfprint = print;
+
+       printf("Initialising Xen virtual ethernet frontend driver.\n");
+
+       (void)ctrl_if_register_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx,
+           CALLBACK_IN_BLOCKING_CONTEXT);
+
+       /* Send a driver-UP notification to the domain controller. */
+       cmsg.type      = CMSG_NETIF_FE;
+       cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
+       cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
+       st.status      = NETIF_DRIVER_STATUS_UP;
+       st.max_handle  = 0;
+       memcpy(cmsg.msg, &st, sizeof(st));
+       ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
+
+#if 0
+       err = xennet_wait_for_interfaces();
+       if (err)
+               ctrl_if_unregister_receiver(CMSG_NETIF_FE, xennet_ctrlif_rx);
+#endif
+
+       return err;
+}
+
+int
+xennet_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xennet_attach_args *xa = (struct xennet_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "xennet") == 0)
+               return 1;
+       return 0;
+}
+
+void
+xennet_attach(struct device *parent, struct device *self, void *aux)
+{
+       struct xennet_attach_args *xneta = (struct xennet_attach_args *)aux;
+       struct xennet_softc *sc = (struct xennet_softc *)self;
+       struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+       int idx;
+
+       aprint_normal(": Xen Virtual Network Interface\n");
+
+       sc->sc_ifno = xneta->xa_handle;
+
+       /* Initialize ifnet structure. */
+       memcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ);
+       ifp->if_softc = sc;
+       ifp->if_start = xennet_start;
+       ifp->if_ioctl = xennet_ioctl;
+       ifp->if_watchdog = xennet_watchdog;
+       ifp->if_flags = IFF_BROADCAST | IFF_NOTRAILERS;
+
+#ifdef mediacode
+       ifmedia_init(&sc->sc_media, 0, xennet_mediachange,
+           xennet_mediastatus);
+       for (idx = 0; idx < nxennet_media; idx++)
+               ifmedia_add(&sc->sc_media, xennet_media[idx], 0, NULL);
+       ifmedia_set(&sc->sc_media, xennet_media[0]);
+#endif
+
+       for (idx = 0; idx < NETIF_TX_RING_SIZE; idx++)
+               sc->sc_tx_bufa[idx].xb_next = idx + 1;
+       for (idx = 0; idx < NETIF_RX_RING_SIZE; idx++)
+               sc->sc_rx_bufa[idx].xb_next = idx + 1;
+}
+
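+/*
+ * Walk the device list for the xennet instance whose interface number
+ * matches the given control-message handle.
+ */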
+static struct xennet_softc *
+find_device(int handle)
+{
+       struct device *dv;
+       struct xennet_softc *xs = NULL;
+
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+               if (dv->dv_cfattach == NULL ||
+                   dv->dv_cfattach->ca_attach != xennet_attach)
+                       continue;
+               xs = (struct xennet_softc *)dv;
+               if (xs->sc_ifno == handle)
+                       break;
+       }
+       return xs;
+}
+
+static void
+xennet_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+       int respond = 1;
+
+       switch (msg->subtype) {
+       case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
+               if (msg->length != sizeof(netif_fe_interface_status_changed_t))
+                       goto error;
+               xennet_status_change(
+                       (netif_fe_interface_status_changed_t *)&msg->msg[0]);
+               break;
+
+       case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+               if (msg->length != sizeof(netif_fe_driver_status_changed_t))
+                       goto error;
+               xennet_driver_status_change(
+                       (netif_fe_driver_status_changed_t *)&msg->msg[0]);
+               break;
+
+       error:
+       default:
+               msg->length = 0;
+               break;
+       }
+
+       if (respond)
+               ctrl_if_send_response(msg);
+}
+
+static void
+xennet_driver_status_change(netif_fe_driver_status_changed_t *status)
+{
+       struct xennet_attach_args xneta;
+       int i;
+
+       DPRINTFN(XEDB_EVENT, ("> max_handle=%d\n", status->max_handle));
+
+       /* XXX FIXME: Abuse of 'max_handle' as interface count. */
+       netctrl.xc_interfaces = status->max_handle;
+       netctrl.xc_connected = 0;
+
+       xneta.xa_device = "xennet";
+
+       for (i = 0; i < netctrl.xc_interfaces; i++) {
+               xneta.xa_handle = i;
+               config_found(netctrl.xc_parent, &xneta, netctrl.xc_cfprint);
+       }
+}
+
+static void
+xennet_status_change(netif_fe_interface_status_changed_t *status)
+{
+       ctrl_msg_t cmsg;
+       netif_fe_interface_connect_t up;
+       struct xennet_softc *sc;
+       struct ifnet *ifp;
+       struct vm_page *pg_tx, *pg_rx;
+
+       DPRINTFN(XEDB_EVENT, (">\n"));
+       DPRINTFN(XEDB_EVENT, ("> status=%d handle=%d mac=%02x:%02x:%02x:%02x:%02x:%02x\n",
+           status->status,
+           status->handle,
+           status->mac[0], status->mac[1], status->mac[2],
+           status->mac[3], status->mac[4], status->mac[5]));
+
+       if (netctrl.xc_interfaces <= 0) {
+               printf("Status change: no interfaces\n");
+               return;
+       }
+
+       sc = find_device(status->handle);
+       if (sc == NULL) {
+               printf("Status change: invalid netif handle %u\n",
+                   status->handle);
+               return;
+       }
+       ifp = &sc->sc_ethercom.ec_if;
+
+       switch (status->status) {
+       case NETIF_INTERFACE_STATUS_DESTROYED:
+               printf("Unexpected netif-DESTROYED message in state %d\n",
+                   sc->sc_backend_state);
+               break;
+
+       case NETIF_INTERFACE_STATUS_DISCONNECTED:
+#if 0
+               if (sc->sc_backend_state != BEST_CLOSED) {
+                       printk("Unexpected netif-DISCONNECTED message"
+                           " in state %d\n", sc->sc_backend_state);
+                       printk("Attempting to reconnect network interface\n");
+
+                       /* Begin interface recovery.
+                        *
+                        * NB. Whilst we're recovering, we turn the
+                        * carrier state off.  We take measures to
+                        * ensure that this device isn't used for
+                        * anything.  We also stop the queue for this
+                        * device.  Various different approaches
+                        * (e.g. continuing to buffer packets) have
+                        * been tested but don't appear to improve the
+                        * overall impact on TCP connections.
+                        *
+                        * TODO: (MAW) Change the Xend<->Guest
+                        * protocol so that a recovery is initiated by
+                        * a special "RESET" message - disconnect
+                        * could just mean we're not allowed to use
+                        * this interface any more.
+                        */
+
+                       /* Stop old i/f to prevent errors whilst we
+                        * rebuild the state. */
+                       spin_lock_irq(&np->tx_lock);
+                       spin_lock(&np->rx_lock);
+                       netif_stop_queue(dev);
+                       np->backend_state = BEST_DISCONNECTED;
+                       spin_unlock(&np->rx_lock);
+                       spin_unlock_irq(&np->tx_lock);
+
+                       /* Free resources. */
+                       free_irq(np->irq, dev);
+                       unbind_evtchn_from_irq(np->evtchn);
+                       free_page((unsigned long)np->tx);
+                       free_page((unsigned long)np->rx);
+               }
+#endif
+
+               /* Move from CLOSED to DISCONNECTED state. */
+               sc->sc_tx = (netif_tx_interface_t *)
+                       uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE);
+               if (sc->sc_tx == NULL)
+                       panic("netif: no tx va");
+               sc->sc_rx = (netif_rx_interface_t *)
+                       uvm_km_valloc_align(kernel_map, PAGE_SIZE, PAGE_SIZE);
+               if (sc->sc_rx == NULL)
+                       panic("netif: no rx va");
+               pg_tx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+               if (pg_tx == NULL) {
+                       panic("netif: no tx pages");
+               }
+               pmap_kenter_pa((vaddr_t)sc->sc_tx, VM_PAGE_TO_PHYS(pg_tx),
+                   VM_PROT_READ | VM_PROT_WRITE);
+               pg_rx = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
+               if (pg_rx == NULL) {
+                       panic("netif: no rx pages");
+               }
+               pmap_kenter_pa((vaddr_t)sc->sc_rx, VM_PAGE_TO_PHYS(pg_rx),
+                   VM_PROT_READ | VM_PROT_WRITE);
+               sc->sc_backend_state = BEST_DISCONNECTED;
+
+               /* Construct an interface-CONNECT message for the
+                * domain controller. */
+               cmsg.type      = CMSG_NETIF_FE;
+               cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
+               cmsg.length    = sizeof(netif_fe_interface_connect_t);
+               up.handle      = status->handle;
+               up.tx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_tx)) >> PAGE_SHIFT;
+               up.rx_shmem_frame = xpmap_ptom(VM_PAGE_TO_PHYS(pg_rx)) >> PAGE_SHIFT;
+               memcpy(cmsg.msg, &up, sizeof(up));
+
+               /* Tell the controller to bring up the interface. */
+               ctrl_if_send_message_block(&cmsg, NULL, 0, 0);
+               break;
+
+       case NETIF_INTERFACE_STATUS_CONNECTED:
+               if (sc->sc_backend_state == BEST_CLOSED) {
+                       printf("Unexpected netif-CONNECTED message"
+                           " in state %d\n", sc->sc_backend_state);
+                       break;
+               }
+
+               memcpy(sc->sc_enaddr, status->mac, ETHER_ADDR_LEN);
+#if 0
+               if (xen_start_info.flags & SIF_PRIVILEGED) {
+                       /* XXX for domain-0 change out ethernet address to be
+                        * different than the physical address since arp
+                        * replies from other domains will report the physical
+                        * address.
+                        */
+                       if (sc->sc_enaddr[0] != 0xaa)
+                               sc->sc_enaddr[0] = 0xaa;
+                       else
+                               sc->sc_enaddr[0] = 0xab;
+               }
+#endif
+
+               /* Recovery procedure: */
+
+               /* Step 1: Reinitialise variables. */
+               sc->sc_rx_resp_cons = sc->sc_tx_resp_cons = /* sc->sc_tx_full = */ 0;
+               sc->sc_rx->event = sc->sc_tx->event = 1;
+
+               /* Step 2: Rebuild the RX and TX ring contents. */
+               network_alloc_rx_buffers(sc);
+               SLIST_INIT(&sc->sc_tx_bufs);
+               network_alloc_tx_buffers(sc);
+
+               /* Step 3: All public and private state should now be
+                * sane.  Get ready to start sending and receiving
+                * packets and give the driver domain a kick because
+                * we've probably just requeued some packets.
+                */
+               sc->sc_backend_state = BEST_CONNECTED;
+               __insn_barrier();
+               hypervisor_notify_via_evtchn(status->evtchn);
+               network_tx_buf_gc(sc);
+
+               if_attach(ifp);
+               ether_ifattach(ifp, sc->sc_enaddr);
+
+               sc->sc_evtchn = status->evtchn;
+               sc->sc_irq = bind_evtchn_to_irq(sc->sc_evtchn);
+               event_set_handler(sc->sc_irq, &xen_network_handler, sc, IPL_NET);
+               hypervisor_enable_irq(sc->sc_irq);
+               netctrl.xc_connected++;
+
+               aprint_normal("%s: MAC address %s\n", sc->sc_dev.dv_xname,
+                   ether_sprintf(sc->sc_enaddr));
+
+#if NRND > 0
+               rnd_attach_source(&sc->rnd_source, sc->sc_dev.dv_xname,
+                   RND_TYPE_NET, 0);
+#endif
+               break;
+
+       default:
+               printf("Status change to unknown value %d\n", 
+                   status->status);
+               break;
+       }
+}
+
+static void
+xennet_tx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg)
+{
+       struct xennet_txbuf *txbuf = (struct xennet_txbuf *)arg;
+
+       DPRINTFN(XEDB_MBUF, ("xennet_tx_mbuf_free %p pa %p\n", txbuf,
+           (void *)txbuf->xt_pa));
+       SLIST_INSERT_HEAD(&txbuf->xt_sc->sc_tx_bufs, txbuf, xt_next);
+       pool_cache_put(&mbpool_cache, m);
+}
+
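+/*
+ * Hand a single receive buffer back to the backend: queue its id on the
+ * RX ring, zap the kernel PTE mapping it, drop its machine page from the
+ * pseudo-physical map and return the page to Xen with a
+ * MEMOP_decrease_reservation, all batched into one multicall.
+ */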
+static void
+xennet_rx_push_buffer(struct xennet_softc *sc, int id)
+{
+       NETIF_RING_IDX ringidx;
+       int nr_pfns;
+
+       ringidx = sc->sc_rx->req_prod;
+       nr_pfns = 0;
+
+       DPRINTFN(XEDB_MEM, ("re-adding page va %p pa %p ma %p/%p to rx_ring "
+                    "at %d with id %d\n",
+                    (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_va,
+                    (void *)sc->sc_rx_bufa[id].xb_rx.xbrx_pa,
+                    (void *)(PTE_BASE[x86_btop
+                                 (sc->sc_rx_bufa[id].xb_rx.xbrx_va)] &
+                        PG_FRAME),
+                    (void *)xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa),
+                    ringidx, id));
+
+       sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id;
+
+       rx_pfn_array[nr_pfns] = xpmap_ptom(sc->sc_rx_bufa[id].xb_rx.xbrx_pa)
+               >> PAGE_SHIFT;
+
+       /* Remove this page from pseudo phys map before
+        * passing back to Xen. */
+       xpmap_phys_to_machine_mapping[(sc->sc_rx_bufa[id].xb_rx.xbrx_pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+               INVALID_P2M_ENTRY;
+
+       rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+       rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
+       rx_mcl[nr_pfns].args[1] = 0;
+       rx_mcl[nr_pfns].args[2] = 0;
+
+       nr_pfns++;
+
+       sc->sc_rx_bufs_to_notify++;
+
+       ringidx++;
+
+       /*
+        * We may have allocated buffers which have entries
+        * outstanding in the page update queue -- make sure we flush
+        * those first!
+        */
+       xpq_flush_queue();
+
+       /* After all PTEs have been zapped we blow away stale TLB entries. */
+       rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+       /* Give away a batch of pages. */
+       rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+       rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+       rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+       rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+       rx_mcl[nr_pfns].args[3] = 0;
+       rx_mcl[nr_pfns].args[4] = DOMID_SELF;
+
+       /* Zap PTEs and give away pages in one big multicall. */
+       (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+       /* Check return status of HYPERVISOR_dom_mem_op(). */
+       if (rx_mcl[nr_pfns].args[5] != nr_pfns)
+               panic("Unable to reduce memory reservation");
+
+       /* Above is a suitable barrier to ensure backend will see requests. */
+       sc->sc_rx->req_prod = ringidx;
+}
+
+static void
+xennet_rx_mbuf_free(struct mbuf *m, caddr_t buf, size_t size, void *arg)
+{
+       union xennet_bufarray *xb = (union xennet_bufarray *)arg;
+       struct xennet_softc *sc = xb->xb_rx.xbrx_sc;
+       int id = (xb - sc->sc_rx_bufa);
+
+       DPRINTFN(XEDB_MBUF, ("xennet_rx_mbuf_free id %d, mbuf %p, buf %p, "
+           "size %d\n", id, m, buf, size));
+
+       xennet_rx_push_buffer(sc, id);
+
+       pool_cache_put(&mbpool_cache, m);
+}
+
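+/*
+ * Receive interrupt handler.  Each response names the machine page the
+ * backend filled; the handler points the buffer's kernel VA at that page
+ * and records the new machine-to-physical translation in a single
+ * multicall, then passes the data up as an external mbuf (or copies it
+ * out when this was the last free receive buffer).
+ */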
+static int
+xen_network_handler(void *arg)
+{
+       struct xennet_softc *sc = arg;
+       struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+       netif_rx_response_t *rx;
+       paddr_t pa;
+       NETIF_RING_IDX ringidx;
+       mmu_update_t *mmu = rx_mmu;
+       multicall_entry_t *mcl = rx_mcl;
+       struct mbuf *m;
+
+       network_tx_buf_gc(sc);
+
+ again:
+       for (ringidx = sc->sc_rx_resp_cons;
+            ringidx != sc->sc_rx->resp_prod;
+            ringidx++) {
+               rx = &sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].resp;
+
+               if (rx->status < 0)
+                       panic("rx->status < 0");
+               /* XXXcl check rx->status for error */
+
+               MGETHDR(m, M_DONTWAIT, MT_DATA);
+               if (m == NULL) {
+                       printf("xennet: rx no mbuf\n");
+                       break;
+               }
+
+               pa = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_pa;
+
+               DPRINTFN(XEDB_EVENT, ("rx event %d for id %d, size %d, "
+                            "status %d, ma %08lx, pa %08lx\n", ringidx,
+                            rx->id, rx->status, rx->status, rx->addr, pa));
+
+               /* Remap the page. */
+               mmu->ptr  = (rx->addr & PG_FRAME) | MMU_MACHPHYS_UPDATE;
+               mmu->val  = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
+               mmu++;
+               mcl->op = __HYPERVISOR_update_va_mapping;
+               mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
+               mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
+               mcl->args[2] = UVMF_FLUSH_TLB; /* 0; */
+               mcl++;
+
+               xpmap_phys_to_machine_mapping
+                       [(pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+                       rx->addr >> PAGE_SHIFT;
+
+               /* Do all the remapping work, and M->P updates, in one
+                * big hypercall. */
+               if ((mcl - rx_mcl) != 0) {
+                       mcl->op = __HYPERVISOR_mmu_update;
+                       mcl->args[0] = (unsigned long)rx_mmu;
+                       mcl->args[1] = mmu - rx_mmu;
+                       mcl->args[2] = 0;
+                       mcl++;
+                       (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
+               }
+               if (0)
+                       printf("page mapped at va %08lx -> %08x/%08lx\n",
+                           sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va,
+                           PTE_BASE[x86_btop(sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)],
+                           rx->addr);
+               mmu = rx_mmu;
+               mcl = rx_mcl;
+
+               DPRINTFN(XEDB_MBUF, ("rx packet mbuf %p va %p pa %p/%p "
+                   "ma %p\n", m,
+                   (void *)sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va,
+                   (void *)(xpmap_mtop(PTE_BASE[x86_btop
+                                           (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)), (void *)pa,
+                   (void *)(PTE_BASE[x86_btop
+                       (sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va)] & PG_FRAME)));
+
+               m->m_len = m->m_pkthdr.len = rx->status;
+               m->m_pkthdr.rcvif = ifp;
+               if (sc->sc_rx->req_prod != sc->sc_rx->resp_prod) {
+                       MEXTADD(m, (void *)(sc->sc_rx_bufa[rx->id].xb_rx.
+                           xbrx_va + (rx->addr & PAGE_MASK)), rx->status, M_DEVBUF,
+                           xennet_rx_mbuf_free,
+                           &sc->sc_rx_bufa[rx->id]);
+               } else {
+                       /*
+                        * This was our last receive buffer, allocate
+                        * memory, copy data and push the receive
+                        * buffer back to the hypervisor.
+                        */
+                       MEXTMALLOC(m, rx->status, M_DONTWAIT);
+                       if ((m->m_flags & M_EXT) == 0) {
+                               printf("xennet: rx no mbuf 2\n");
+                               m_free(m);
+                               break;
+                       }
+                       memcpy(m->m_data, (void *)(sc->sc_rx_bufa[rx->id].
+                           xb_rx.xbrx_va + (rx->addr & PAGE_MASK)), rx->status);
+                       xennet_rx_push_buffer(sc, rx->id);
+               }
+
+#ifdef XENNET_DEBUG_DUMP
+               xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "r", rx->id);
+#endif
+
+#if NBPFILTER > 0
+               /*
+                * Pass packet to bpf if there is a listener.
+                */
+               if (ifp->if_bpf)
+                       bpf_mtap(ifp->if_bpf, m);
+#endif
+
+               ifp->if_ipackets++;
+
+               /* Pass the packet up. */
+               (*ifp->if_input)(ifp, m);
+       }
+
+       sc->sc_rx_resp_cons = ringidx;
+       sc->sc_rx->event = sc->sc_rx_resp_cons + 1;
+
+       if (sc->sc_rx->resp_prod != ringidx)
+               goto again;
+
+       return 0;
+}
+
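+/*
+ * The buffer id arrays double as an intrusive free list: entry 0's
+ * xb_next field holds the index of the first free entry, and each free
+ * entry's xb_next points at the next one.
+ */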
+static inline int
+get_bufarray_entry(union xennet_bufarray *a)
+{
+       int idx;
+
+       idx = a[0].xb_next;
+       a[0].xb_next = a[idx].xb_next;
+       return idx;
+}
+
+static inline void
+put_bufarray_entry(union xennet_bufarray *a, int idx)
+{
+
+       a[idx].xb_next = a[0].xb_next;
+       a[0].xb_next = idx;
+}
+
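+/*
+ * Reap completed transmit responses: free the mbufs, recycle the buffer
+ * ids, and re-arm the tx event past the next batch of responses.  The
+ * do/while re-checks resp_prod after setting the event, to pick up
+ * responses that raced in meanwhile.
+ */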
+static void
+network_tx_buf_gc(struct xennet_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+       NETIF_RING_IDX idx, prod;
+
+       do {
+               prod = sc->sc_tx->resp_prod;
+
+               for (idx = sc->sc_tx_resp_cons; idx != prod; idx++) {
+                       DPRINTFN(XEDB_EVENT, ("tx event at pos %d, status: "
+                                    "%d, id: %d, mbuf %p, buf %p\n", idx,
+                                    sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.status,
+                                    sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id,
+                                    sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m,
+                                    mtod(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m, void *)));
+                       m_freem(sc->sc_tx_bufa[sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id].xb_tx.xbtx_m);
+                       put_bufarray_entry(sc->sc_tx_bufa,
+                           sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].resp.id);
+                       sc->sc_tx_entries--; /* atomic */
+               }
+
+               sc->sc_tx_resp_cons = prod;
+
+               /*
+                * Set a new event, then check for race with update of
+                * tx_cons.
+                */
+               sc->sc_tx->event = /* atomic */
+                       prod + (sc->sc_tx_entries >> 1) + 1;
+               __insn_barrier();
+       } while (prod != sc->sc_tx->resp_prod);
+
+       if (sc->sc_tx->resp_prod == sc->sc_tx->req_prod)
+               ifp->if_timer = 0;
+       /* KDASSERT(sc->sc_net_idx->tx_req_prod == */
+       /* TX_RING_ADD(sc->sc_net_idx->tx_resp_prod, sc->sc_tx_entries)); */
+}
+
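+/*
+ * Refill the RX ring up to RX_MAX_ENTRIES outstanding buffers: allocate
+ * fresh pages, queue their ids on the ring, then zap their PTEs and
+ * return the machine pages to Xen (MEMOP_decrease_reservation) so the
+ * backend can map packet data into them.
+ */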
+static void
+network_alloc_rx_buffers(struct xennet_softc *sc)
+{
+       vaddr_t rxpages, va;
+       paddr_t pa;
+       struct vm_page *pg;
+       int id, nr_pfns;
+       NETIF_RING_IDX ringidx;
+       int s;
+
+       ringidx = sc->sc_rx->req_prod;
+       if (0)
+               printf("network_alloc_rx_buffers prod %d cons %d\n",
+                   ringidx, sc->sc_rx_resp_cons);
+       if ((ringidx - sc->sc_rx_resp_cons) > (RX_MAX_ENTRIES / 2))
+               return;
+
+       nr_pfns = 0;
+
+       rxpages = uvm_km_valloc_align(kernel_map, RX_ENTRIES * PAGE_SIZE,
+           PAGE_SIZE);
+
+       s = splnet();
+       for (va = rxpages; va < rxpages + RX_ENTRIES * PAGE_SIZE;
+            va += PAGE_SIZE) {
+               pg = uvm_pagealloc(NULL, 0, NULL, 0);
+               if (pg == NULL)
+                       panic("network_alloc_rx_buffers: no pages");
+               pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                   VM_PROT_READ | VM_PROT_WRITE);
+
+               id = get_bufarray_entry(sc->sc_rx_bufa);
+               sc->sc_rx_bufa[id].xb_rx.xbrx_va = va;
+               sc->sc_rx_bufa[id].xb_rx.xbrx_sc = sc;
+
+               pa = VM_PAGE_TO_PHYS(pg);
+               DPRINTFN(XEDB_MEM, ("adding page va %p pa %p/%p "
+                   "ma %p/%p to rx_ring at %d with id %d\n", (void *)va,
+                            (void *)(VM_PAGE_TO_PHYS(pg) & PG_FRAME), (void *)xpmap_mtop(PTE_BASE[x86_btop(va)]),
+                   (void *)(PTE_BASE[x86_btop(va)] & PG_FRAME),
+                            (void *)xpmap_ptom(VM_PAGE_TO_PHYS(pg)),
+                   ringidx, id));
+               sc->sc_rx_bufa[id].xb_rx.xbrx_pa = pa;
+               sc->sc_rx->ring[MASK_NETIF_RX_IDX(ringidx)].req.id = id;
+
+               rx_pfn_array[nr_pfns] = xpmap_ptom(pa) >> PAGE_SHIFT;
+
+               /* Remove this page from pseudo phys map before
+                * passing back to Xen. */
+               xpmap_phys_to_machine_mapping[(pa - XPMAP_OFFSET) >> PAGE_SHIFT] =
+                       INVALID_P2M_ENTRY;
+
+               rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+               rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
+               rx_mcl[nr_pfns].args[1] = 0;
+               rx_mcl[nr_pfns].args[2] = 0;
+
+               nr_pfns++;
+
+               sc->sc_rx_bufs_to_notify++;
+
+               ringidx++;
+               if ((ringidx - sc->sc_rx_resp_cons) == RX_MAX_ENTRIES)
+                       break;
+       }
+
+       if (nr_pfns == 0) {
+               splx(s);
+               return;
+       }
+
+       /*
+        * We may have allocated buffers which have entries
+        * outstanding in the page update queue -- make sure we flush
+        * those first!
+        */
+       xpq_flush_queue();
+
+       /* After all PTEs have been zapped we blow away stale TLB entries. */
+       rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+       /* Give away a batch of pages. */
+       rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+       rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+       rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+       rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+       rx_mcl[nr_pfns].args[3] = 0;
+       rx_mcl[nr_pfns].args[4] = DOMID_SELF;
+
+       /* Zap PTEs and give away pages in one big multicall. */
+       (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+       /* Check return status of HYPERVISOR_dom_mem_op(). */
+       if (rx_mcl[nr_pfns].args[5] != nr_pfns)
+               panic("Unable to reduce memory reservation");
+
+       /* Above is a suitable barrier to ensure backend will see requests. */
+       sc->sc_rx->req_prod = ringidx;
+
+       splx(s);
+}
+
+static void
+network_alloc_tx_buffers(struct xennet_softc *sc)
+{
+       vaddr_t txpages, va;
+       struct vm_page *pg;
+       struct xennet_txbuf *txbuf;
+       int i;
+
+       txpages = uvm_km_valloc_align(kernel_map,
+           (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE, PAGE_SIZE);
+       for (va = txpages;
+            va < txpages + (TX_ENTRIES / TXBUF_PER_PAGE) * PAGE_SIZE;
+            va += PAGE_SIZE) {
+               pg = uvm_pagealloc(NULL, 0, NULL, 0);
+               if (pg == NULL)
+                       panic("network_alloc_tx_buffers: no pages");
+               pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
+                   VM_PROT_READ | VM_PROT_WRITE);
+
+               for (i = 0; i < TXBUF_PER_PAGE; i++) {
+                       txbuf = (struct xennet_txbuf *)
+                               (va + i * (PAGE_SIZE / TXBUF_PER_PAGE));
+                       txbuf->xt_sc = sc;
+                       txbuf->xt_pa = VM_PAGE_TO_PHYS(pg) +
+                               i * (PAGE_SIZE / TXBUF_PER_PAGE) +
+                               sizeof(struct xennet_txbuf);
+                       SLIST_INSERT_HEAD(&sc->sc_tx_bufs, txbuf, xt_next);
+               }
+       }
+}
+
+/* 
+ * Called at splnet.
+ */
+void
+xennet_start(struct ifnet *ifp)
+{
+       struct xennet_softc *sc = ifp->if_softc;
+       struct mbuf *m, *new_m;
+       struct xennet_txbuf *txbuf;
+       netif_tx_request_t *txreq;
+       NETIF_RING_IDX idx;
+       paddr_t pa;
+       int bufid;
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start()\n", sc->sc_dev.dv_xname));
+
+#ifdef DIAGNOSTIC
+       IFQ_POLL(&ifp->if_snd, m);
+       if (m == NULL)
+               panic("%s: No packet to start", sc->sc_dev.dv_xname);
+#endif
+
+       if ((ifp->if_flags & (IFF_RUNNING | IFF_OACTIVE)) != IFF_RUNNING)
+               return;
+
+       idx = sc->sc_tx->req_prod;
+       while (/*CONSTCOND*/1) {
+
+               IFQ_POLL(&ifp->if_snd, m);
+               if (m == NULL)
+                       break;
+
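+               /*
+                * Find the physical address of the packet data: cluster
+                * mbufs carry it in ext_paddr, ordinary mbufs derive it
+                * from m_paddr plus the data offset.
+                */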
+               switch (m->m_flags & (M_EXT|M_EXT_CLUSTER)) {
+               case M_EXT|M_EXT_CLUSTER:
+                       pa = m->m_ext.ext_paddr +
+                               (m->m_data - m->m_ext.ext_buf);
+                       break;
+               default:
+               case 0:
+                       pa = m->m_paddr + M_BUFOFFSET(m) +
+                               (m->m_data - M_BUFADDR(m));
+                       break;
+               }
+
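+               /*
+                * The tx request carries a single machine-contiguous
+                * extent, so a chained packet, or one whose data crosses
+                * a page boundary, is first copied into a private
+                * transmit buffer.
+                */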
+               if (m->m_pkthdr.len != m->m_len ||
+                   (pa ^ (pa + m->m_pkthdr.len)) & PG_FRAME) {
+                       txbuf = SLIST_FIRST(&sc->sc_tx_bufs);
+                       if (txbuf == NULL) {
+                               /* printf("xennet: no tx bufs\n"); */
+                               break;
+                       }
+
+                       MGETHDR(new_m, M_DONTWAIT, MT_DATA);
+                       if (new_m == NULL) {
+                               printf("xennet: no mbuf\n");
+                               break;
+                       }
+
+                       SLIST_REMOVE_HEAD(&sc->sc_tx_bufs, xt_next);
+                       IFQ_DEQUEUE(&ifp->if_snd, m);
+
+                       KASSERT(m->m_flags & M_PKTHDR);
+                       M_COPY_PKTHDR(new_m, m);
+                       m_copydata(m, 0, m->m_pkthdr.len, txbuf->xt_buf);
+                       MEXTADD(new_m, txbuf->xt_buf, m->m_pkthdr.len,
+                           M_DEVBUF, xennet_tx_mbuf_free, txbuf);
+                       new_m->m_ext.ext_paddr = txbuf->xt_pa;
+                       new_m->m_len = new_m->m_pkthdr.len = m->m_pkthdr.len;
+
+                       m_freem(m);
+                       m = new_m;
+
+                       pa = m->m_ext.ext_paddr +
+                               (m->m_data - m->m_ext.ext_buf);
+               } else
+                       IFQ_DEQUEUE(&ifp->if_snd, m);
+
+               bufid = get_bufarray_entry(sc->sc_tx_bufa);
+               sc->sc_tx_bufa[bufid].xb_tx.xbtx_m = m;
+
+               DPRINTFN(XEDB_MBUF, ("xennet_start id %d, mbuf %p, buf %p/%p, "
+                            "size %d\n", bufid, m, mtod(m, void *),
+                            (void *)pa, m->m_pkthdr.len));
+#ifdef XENNET_DEBUG_DUMP
+               xennet_hex_dump(mtod(m, u_char *), m->m_pkthdr.len, "s", bufid);
+#endif
+
+               txreq = &sc->sc_tx->ring[MASK_NETIF_TX_IDX(idx)].req;
+               txreq->id = bufid;
+               txreq->addr = xpmap_ptom(pa);
+               txreq->size = m->m_pkthdr.len;
+
+               __insn_barrier();
+               idx++;
+               sc->sc_tx->req_prod = idx;
+
+               sc->sc_tx_entries++; /* XXX atomic */
+
+#ifdef XENNET_DEBUG
+               DPRINTFN(XEDB_MEM, ("packet addr %p/%p, physical %p/%p, "
+                   "m_paddr %p, len %d/%d\n", M_BUFADDR(m), mtod(m, void *),
+                   (void *)*kvtopte(mtod(m, vaddr_t)),
+                   (void *)xpmap_mtop(*kvtopte(mtod(m, vaddr_t))),
+                   (void *)m->m_paddr, m->m_pkthdr.len, m->m_len));
+#endif
+
+#if NBPFILTER > 0
+               /*
+                * Pass packet to bpf if there is a listener.
+                */
+               if (ifp->if_bpf)
+                       bpf_mtap(ifp->if_bpf, m);
+#endif
+       }
+
+       ifp->if_flags &= ~IFF_OACTIVE;
+
+       network_tx_buf_gc(sc);
+
+       __insn_barrier();
+       if (sc->sc_tx->resp_prod != idx)
+               hypervisor_notify_via_evtchn(sc->sc_evtchn);
+
+       ifp->if_timer = 5;
+
+       ifp->if_opackets++;
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_start() done\n",
+           sc->sc_dev.dv_xname));
+}
+
+int
+xennet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+       struct xennet_softc *sc = ifp->if_softc;
+       struct ifaddr *ifa = (struct ifaddr *)data;
+#ifdef mediacode
+       struct ifreq *ifr = (struct ifreq *)data;
+#endif
+       int s, error = 0;
+
+       s = splnet();
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl()\n", sc->sc_dev.dv_xname));
+
+       switch (cmd) {
+       case SIOCSIFADDR:
+               DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFADDR\n",
+                   sc->sc_dev.dv_xname));
+               ifp->if_flags |= IFF_UP;
+               switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+               case AF_INET:
+                       xennet_init(sc);
+                       arp_ifinit(ifp, ifa);
+                       break;
+#endif
+               default:
+                       xennet_init(sc);
+                       break;
+               }
+               break;
+
+       case SIOCSIFFLAGS:
+               DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOCSIFFLAGS\n",
+                   sc->sc_dev.dv_xname));
+               break;
+
+       case SIOCADDMULTI:
+       case SIOCDELMULTI:
+               DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*MULTI\n",
+                   sc->sc_dev.dv_xname));
+               break;
+
+#ifdef mediacode
+       case SIOCGIFMEDIA:
+       case SIOCSIFMEDIA:
+               DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() SIOC*IFMEDIA\n",
+                   sc->sc_dev.dv_xname));
+               error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
+               break;
+#endif
+
+       default:
+               DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl(0x%lx) unknown cmd\n",
+                   sc->sc_dev.dv_xname, cmd));
+               error = EINVAL;
+               break;
+       }
+
+       splx(s);
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_ioctl() returning %d\n",
+           sc->sc_dev.dv_xname, error));
+
+       return error;
+}
+
+void
+xennet_watchdog(struct ifnet *ifp)
+{
+
+       panic("xennet_watchdog");
+}
+
+void
+xennet_init(struct xennet_softc *sc)
+{
+       struct ifnet *ifp = &sc->sc_ethercom.ec_if;
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_init()\n", sc->sc_dev.dv_xname));
+
+       if (ifp->if_flags & IFF_UP) {
+               if ((ifp->if_flags & IFF_RUNNING) == 0)
+                       xennet_reset(sc);
+
+               ifp->if_flags |= IFF_RUNNING;
+               ifp->if_flags &= ~IFF_OACTIVE;
+               ifp->if_timer = 0;
+       } else {
+               ifp->if_flags &= ~IFF_RUNNING;
+               xennet_reset(sc);
+       }
+}
+
+void
+xennet_reset(struct xennet_softc *sc)
+{
+
+       DPRINTFN(XEDB_FOLLOW, ("%s: xennet_reset()\n", sc->sc_dev.dv_xname));
+}
+
+#ifdef mediacode
+/*
+ * Media change callback.
+ */
+static int
+xennet_mediachange(struct ifnet *ifp)
+{
+       struct xennet_softc *sc = ifp->if_softc;
+
+       switch (IFM_SUBTYPE(sc->sc_media.ifm_media)) {
+       case IFM_AUTO:
+               break;
+       default:
+               return (1);
+       }
+
+       return (0);
+}
+
+/*
+ * Media status callback.
+ */
+static void
+xennet_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+       struct xennet_softc *sc = ifp->if_softc;
+
+       if (IFM_SUBTYPE(ifmr->ifm_active) == IFM_AUTO)
+               ifmr->ifm_active = sc->sc_media.ifm_cur->ifm_data;
+
+       ifmr->ifm_status &= ~IFM_AVALID;
+}
+#endif
+
+int
+xennet_bootstatic_callback(struct nfs_diskless *nd)
+{
+       struct ifnet *ifp = nd->nd_ifp;
+       struct xennet_softc *sc = (struct xennet_softc *)ifp->if_softc;
+       union xen_cmdline_parseinfo xcp;
+       struct sockaddr_in *sin;
+
+       memset(&xcp, 0, sizeof(xcp.xcp_netinfo));
+       xcp.xcp_netinfo.xi_ifno = sc->sc_ifno;
+       xcp.xcp_netinfo.xi_root = nd->nd_root.ndm_host;
+       xen_parse_cmdline(XEN_PARSE_NETINFO, &xcp);
+
+       nd->nd_myip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[0]);
+       nd->nd_gwip.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[2]);
+       nd->nd_mask.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[3]);
+
+       sin = (struct sockaddr_in *) &nd->nd_root.ndm_saddr;
+       memset((caddr_t)sin, 0, sizeof(*sin));
+       sin->sin_len = sizeof(*sin);
+       sin->sin_family = AF_INET;
+       sin->sin_addr.s_addr = ntohl(xcp.xcp_netinfo.xi_ip[1]);
+
+       return (NFS_BOOTSTATIC_HAS_MYIP|NFS_BOOTSTATIC_HAS_GWIP|
+           NFS_BOOTSTATIC_HAS_MASK|NFS_BOOTSTATIC_HAS_SERVADDR|
+           NFS_BOOTSTATIC_HAS_SERVER);
+}
+
+
+#ifdef XENNET_DEBUG_DUMP
+#define XCHR(x) "0123456789abcdef"[(x) & 0xf]
+static void
+xennet_hex_dump(unsigned char *pkt, size_t len, char *type, int id)
+{
+       size_t i, j;
+
+       printf("pkt %p len %lu/%lx type %s id %d\n", pkt, (u_long)len,
+           (u_long)len, type, id);
+       printf("00000000  ");
+       for (i = 0; i < len; i++) {
+               printf("%c%c ", XCHR(pkt[i]>>4), XCHR(pkt[i]));
+               if ((i+1) % 16 == 8)
+                       printf(" ");
+               if ((i+1) % 16 == 0) {
+                       printf(" %c", '|');
+                       for (j = 0; j < 16; j++)
+                               printf("%c", pkt[i-15+j] >= 32 &&
+                                   pkt[i-15+j] < 127 ? pkt[i-15+j] : '.');
+                       printf("%c\n%c%c%c%c%c%c%c%c  ", '|',
+                           XCHR((i+1)>>28), XCHR((i+1)>>24),
+                           XCHR((i+1)>>20), XCHR((i+1)>>16),
+                           XCHR((i+1)>>12), XCHR((i+1)>>8),
+                           XCHR((i+1)>>4), XCHR(i+1));
+               }
+       }
+       printf("\n");
+}
+#undef XCHR
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xbd.c
new file mode 100644 (file)
index 0000000..b72ffc9
--- /dev/null
@@ -0,0 +1,1368 @@
+/* $NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xbd.c,v 1.9.2.1 2004/05/22 15:59:11 he Exp $");
+
+#include "xbd.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/ioctl.h>
+#include <sys/device.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/conf.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+
+#include <uvm/uvm.h>
+
+#include <dev/dkvar.h>
+#include <machine/xbdvar.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/hypervisor-ifs/hypervisor-if.h>
+#include <machine/hypervisor-ifs/vbd.h>
+#include <machine/evtchn.h>
+
+
+static void xbd_attach(struct device *, struct device *, void *);
+static int xbd_detach(struct device *, int);
+
+#if NXBD > 0
+int xbd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(xbd, sizeof(struct xbd_softc),
+    xbd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver xbd_cd;
+#endif
+
+#if NWD > 0
+int xbd_wd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(wd, sizeof(struct xbd_softc),
+    xbd_wd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver wd_cd;
+#endif
+
+#if NSD > 0
+int xbd_sd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(sd, sizeof(struct xbd_softc),
+    xbd_sd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver sd_cd;
+#endif
+
+#if NCD > 0
+int xbd_cd_match(struct device *, struct cfdata *, void *);
+CFATTACH_DECL(cd, sizeof(struct xbd_softc),
+    xbd_cd_match, xbd_attach, xbd_detach, NULL);
+
+extern struct cfdriver cd_cd;
+#endif
+
+
+dev_type_open(xbdopen);
+dev_type_close(xbdclose);
+dev_type_read(xbdread);
+dev_type_write(xbdwrite);
+dev_type_ioctl(xbdioctl);
+dev_type_ioctl(xbdioctl_cdev);
+dev_type_strategy(xbdstrategy);
+dev_type_dump(xbddump);
+dev_type_size(xbdsize);
+
+#if NXBD > 0
+const struct bdevsw xbd_bdevsw = {
+       xbdopen, xbdclose, xbdstrategy, xbdioctl,
+       xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw xbd_cdevsw = {
+       xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+       nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_major;
+#endif
+
+#if NWD > 0
+const struct bdevsw wd_bdevsw = {
+       xbdopen, xbdclose, xbdstrategy, xbdioctl,
+       xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw wd_cdevsw = {
+       xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+       nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_wd_major;
+static dev_t xbd_wd_cdev_major;
+#endif
+
+#if NSD > 0
+const struct bdevsw sd_bdevsw = {
+       xbdopen, xbdclose, xbdstrategy, xbdioctl,
+       xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw sd_cdevsw = {
+       xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+       nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_sd_major;
+static dev_t xbd_sd_cdev_major;
+#endif
+
+#if NCD > 0
+const struct bdevsw cd_bdevsw = {
+       xbdopen, xbdclose, xbdstrategy, xbdioctl,
+       xbddump, xbdsize, D_DISK
+};
+
+const struct cdevsw cd_cdevsw = {
+       xbdopen, xbdclose, xbdread, xbdwrite, xbdioctl_cdev,
+       nostop, notty, nopoll, nommap, nokqfilter, D_DISK
+};
+
+static dev_t xbd_cd_major;
+static dev_t xbd_cd_cdev_major;
+#endif
+
+
+static int     xbdstart(struct dk_softc *, struct buf *);
+static int     xbd_response_handler(void *);
+static void    xbd_update_create_kthread(void *);
+static void    xbd_update_kthread(void *);
+static int     xbd_update_handler(void *);
+
+static int     xbdinit(struct xbd_softc *, xen_disk_t *, struct dk_intf *);
+
+/* Pseudo-disk Interface */
+static struct dk_intf dkintf_esdi = {
+       DTYPE_ESDI,
+       "Xen Virtual ESDI",
+       xbdopen,
+       xbdclose,
+       xbdstrategy,
+       xbdstart,
+};
+#if NSD > 0
+static struct dk_intf dkintf_scsi = {
+       DTYPE_SCSI,
+       "Xen Virtual SCSI",
+       xbdopen,
+       xbdclose,
+       xbdstrategy,
+       xbdstart,
+};
+#endif
+
+#if NXBD > 0
+static struct xbd_attach_args xbd_ata = {
+       .xa_device = "xbd",
+       .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+#if NWD > 0
+static struct xbd_attach_args wd_ata = {
+       .xa_device = "wd",
+       .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+#if NSD > 0
+static struct xbd_attach_args sd_ata = {
+       .xa_device = "sd",
+       .xa_dkintf = &dkintf_scsi,
+};
+#endif
+
+#if NCD > 0
+static struct xbd_attach_args cd_ata = {
+       .xa_device = "cd",
+       .xa_dkintf = &dkintf_esdi,
+};
+#endif
+
+static struct sysctlnode *diskcookies;
+
+
+#if defined(XBDDEBUG) && !defined(DEBUG)
+#define DEBUG
+#endif
+
+#ifdef DEBUG
+int xbddebug = 0;
+
+#define XBDB_FOLLOW    0x1
+#define XBDB_IO                0x2
+#define XBDB_SETUP     0x4
+#define XBDB_HOTPLUG   0x8
+
+#define IFDEBUG(x,y)           if (xbddebug & (x)) y
+#define DPRINTF(x,y)           IFDEBUG(x, printf y)
+#define DPRINTF_FOLLOW(y)      DPRINTF(XBDB_FOLLOW, y)
+#define        DEBUG_MARK_UNUSED(_xr)  (_xr)->xr_sc = (void *)0xdeadbeef
+
+struct xbdreq *xbd_allxr;
+#else
+#define IFDEBUG(x,y)
+#define DPRINTF(x,y)
+#define DPRINTF_FOLLOW(y)
+#define        DEBUG_MARK_UNUSED(_xr)
+#endif
+
+#ifdef DIAGNOSTIC
+#define DIAGPANIC(x)           panic x 
+#define DIAGCONDPANIC(x,y)     if (x) panic y
+#else
+#define DIAGPANIC(x)
+#define DIAGCONDPANIC(x,y)
+#endif
+
+
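+/*
+ * A buf too large for one ring request (or one needing realignment) is
+ * split across several xbdreqs: the first is its own parent and tracks
+ * the bytes still to queue (xr_bqueue) and still to complete (xr_bdone),
+ * while each child only records its own share (xr_breq).
+ */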
+struct xbdreq {
+       union {
+               SLIST_ENTRY(xbdreq) _unused;    /* ptr. to next free xbdreq */
+               SIMPLEQ_ENTRY(xbdreq) _suspended;
+                                       /* link when on suspended queue. */
+       } _link;
+       struct xbdreq           *xr_parent;     /* ptr. to parent xbdreq */
+       struct buf              *xr_bp;         /* ptr. to original I/O buf */
+       daddr_t                 xr_bn;          /* block no. to process */
+       long                    xr_bqueue;      /* bytes left to queue */
+       long                    xr_bdone;       /* bytes left */
+       vaddr_t                 xr_data;        /* ptr. to data to be proc. */
+       vaddr_t                 xr_aligned;     /* ptr. to aligned data */
+       long                    xr_breq;        /* bytes in this req. */
+       struct xbd_softc        *xr_sc;         /* ptr. to xbd softc */
+};
+#define        xr_unused       _link._unused
+#define        xr_suspended    _link._suspended
+
+SLIST_HEAD(,xbdreq) xbdreqs =
+       SLIST_HEAD_INITIALIZER(xbdreqs);
+static SIMPLEQ_HEAD(, xbdreq) xbdr_suspended =
+       SIMPLEQ_HEAD_INITIALIZER(xbdr_suspended);
+
+#define        CANGET_XBDREQ() (!SLIST_EMPTY(&xbdreqs))
+
+#define        GET_XBDREQ(_xr) do {                            \
+       (_xr) = SLIST_FIRST(&xbdreqs);                  \
+       if (__predict_true(_xr))                        \
+               SLIST_REMOVE_HEAD(&xbdreqs, xr_unused); \
+} while (/*CONSTCOND*/0)
+
+#define        PUT_XBDREQ(_xr) do {                            \
+       DEBUG_MARK_UNUSED(_xr);                         \
+       SLIST_INSERT_HEAD(&xbdreqs, _xr, xr_unused);    \
+} while (/*CONSTCOND*/0)
+
+static struct bufq_state bufq;
+static int bufq_users = 0;
+
+#define XEN_MAJOR(_dev)        ((_dev) >> 8)
+#define XEN_MINOR(_dev)        ((_dev) & 0xff)
+
+#define        XEN_SCSI_DISK0_MAJOR    8
+#define        XEN_SCSI_DISK1_MAJOR    65
+#define        XEN_SCSI_DISK2_MAJOR    66
+#define        XEN_SCSI_DISK3_MAJOR    67
+#define        XEN_SCSI_DISK4_MAJOR    68
+#define        XEN_SCSI_DISK5_MAJOR    69
+#define        XEN_SCSI_DISK6_MAJOR    70
+#define        XEN_SCSI_DISK7_MAJOR    71
+#define        XEN_SCSI_DISK8_MAJOR    128
+#define        XEN_SCSI_DISK9_MAJOR    129
+#define        XEN_SCSI_DISK10_MAJOR   130
+#define        XEN_SCSI_DISK11_MAJOR   131
+#define        XEN_SCSI_DISK12_MAJOR   132
+#define        XEN_SCSI_DISK13_MAJOR   133
+#define        XEN_SCSI_DISK14_MAJOR   134
+#define        XEN_SCSI_DISK15_MAJOR   135
+#define        XEN_SCSI_CDROM_MAJOR    11
+
+#define        XEN_IDE0_MAJOR          3
+#define        XEN_IDE1_MAJOR          22
+#define        XEN_IDE2_MAJOR          33
+#define        XEN_IDE3_MAJOR          34
+#define        XEN_IDE4_MAJOR          56
+#define        XEN_IDE5_MAJOR          57
+#define        XEN_IDE6_MAJOR          88
+#define        XEN_IDE7_MAJOR          89
+#define        XEN_IDE8_MAJOR          90
+#define        XEN_IDE9_MAJOR          91
+
+#define        XEN_BSHIFT      9               /* log2(XEN_BSIZE) */
+#define        XEN_BSIZE       (1 << XEN_BSHIFT)
+
+#define MAX_VBDS 64
+static int nr_vbds;
+static xen_disk_t *vbd_info;
+
+static blk_ring_t *blk_ring = NULL;
+static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */
+static BLK_RING_IDX req_prod;  /* Private request producer.         */
+static BLK_RING_IDX last_req_prod;  /* Request producer at last trap. */
+
+#define STATE_ACTIVE    0
+#define STATE_SUSPENDED 1
+#define STATE_CLOSED    2
+static unsigned int state = STATE_SUSPENDED;
+
+
+#define XBDUNIT(x)             DISKUNIT(x)
+#define GETXBD_SOFTC(_xs, x) do {                      \
+       if (!((_xs) = getxbd_softc(x)))                 \
+               return ENXIO;                           \
+} while (/*CONSTCOND*/0)
+#define GETXBD_SOFTC_CDEV(_xs, x) do {                 \
+       dev_t bx = devsw_chr2blk((x));                  \
+       if (bx == NODEV)                                \
+               return ENXIO;                           \
+       if (!((_xs) = getxbd_softc(bx)))                \
+               return ENXIO;                           \
+} while (/*CONSTCOND*/0)
+
+static struct xbd_softc *
+getxbd_softc(dev_t dev)
+{
+       int     unit = XBDUNIT(dev);
+
+       DPRINTF_FOLLOW(("getxbd_softc(0x%x): major = %d unit = %d\n", dev,
+           major(dev), unit));
+#if NXBD > 0
+       if (major(dev) == xbd_major)
+               return device_lookup(&xbd_cd, unit);
+#endif
+#if NWD > 0
+       if (major(dev) == xbd_wd_major || major(dev) == xbd_wd_cdev_major)
+               return device_lookup(&wd_cd, unit);
+#endif
+#if NSD > 0
+       if (major(dev) == xbd_sd_major || major(dev) == xbd_sd_cdev_major)
+               return device_lookup(&sd_cd, unit);
+#endif
+#if NCD > 0
+       if (major(dev) == xbd_cd_major || major(dev) == xbd_cd_cdev_major)
+               return device_lookup(&cd_cd, unit);
+#endif
+       return NULL;
+}
+
+static int
+get_vbd_info(xen_disk_t *disk_info)
+{
+       int err;
+       block_io_op_t op; 
+
+       /* Probe for disk information. */
+       memset(&op, 0, sizeof(op)); 
+       op.cmd = BLOCK_IO_OP_VBD_PROBE; 
+       op.u.probe_params.domain = 0; 
+       op.u.probe_params.xdi.max = MAX_VBDS;
+       op.u.probe_params.xdi.disks = disk_info;
+       op.u.probe_params.xdi.count = 0;
+
+       err = HYPERVISOR_block_io_op(&op);
+       if (err) {
+               printf("WARNING: Could not probe disks (%d)\n", err);
+               DIAGPANIC(("get_vbd_info: Could not probe disks (%d)", err));
+               return -1;
+       }
+
+       return op.u.probe_params.xdi.count;
+}
+
+static void
+reset_interface(void)
+{
+       block_io_op_t op; 
+
+       op.cmd = BLOCK_IO_OP_RESET;
+       if (HYPERVISOR_block_io_op(&op) != 0)
+               printf("xbd: Possible blkdev trouble: couldn't reset ring\n");
+}
+
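+/*
+ * Bring up the shared block ring: reset it, ask Xen for the ring's
+ * machine frame (BLOCK_IO_OP_RING_ADDRESS), map that frame into kernel
+ * VA with pmap_kenter_ma, clear the ring indexes and install the
+ * block-device event handler.
+ */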
+static void
+init_interface(void)
+{
+       block_io_op_t op; 
+
+       reset_interface();
+
+       if (blk_ring == NULL) {
+               op.cmd = BLOCK_IO_OP_RING_ADDRESS;
+               (void)HYPERVISOR_block_io_op(&op);
+
+               blk_ring = (blk_ring_t *)uvm_km_valloc_align(kernel_map,
+                   PAGE_SIZE, PAGE_SIZE);
+               pmap_kenter_ma((vaddr_t)blk_ring, op.u.ring_mfn << PAGE_SHIFT,
+                   VM_PROT_READ|VM_PROT_WRITE);
+               DPRINTF(XBDB_SETUP, ("init_interface: "
+                   "ring va %p and wired to %p\n",
+                   blk_ring, (void *)(op.u.ring_mfn << PAGE_SHIFT)));
+
+               blk_ring->req_prod = blk_ring->resp_prod =
+                       resp_cons = req_prod = last_req_prod = 0;
+
+               event_set_handler(_EVENT_BLKDEV, &xbd_response_handler,
+                   NULL, IPL_BIO);
+               hypervisor_enable_event(_EVENT_BLKDEV);
+       }
+
+       __insn_barrier();
+       state = STATE_ACTIVE;
+}
+
+static void
+enable_update_events(struct device *self)
+{
+
+       kthread_create(xbd_update_create_kthread, self);
+       event_set_handler(_EVENT_VBD_UPD, &xbd_update_handler, self, IPL_BIO);
+       hypervisor_enable_event(_EVENT_VBD_UPD);
+}
+
+static void
+signal_requests_to_xen(void)
+{
+       block_io_op_t op; 
+
+       DPRINTF(XBDB_IO, ("signal_requests_to_xen: %d -> %d\n",
+           blk_ring->req_prod, MASK_BLK_IDX(req_prod)));
+       blk_ring->req_prod = MASK_BLK_IDX(req_prod);
+       last_req_prod = req_prod;
+
+       op.cmd = BLOCK_IO_OP_SIGNAL; 
+       HYPERVISOR_block_io_op(&op);
+       return;
+}
+
+static void
+setup_sysctl(void)
+{
+       struct sysctlnode *pnode;
+
+       sysctl_createv(NULL, 0, NULL, NULL,
+                      0,
+                      CTLTYPE_NODE, "machdep", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_MACHDEP, CTL_EOL);
+
+       sysctl_createv(NULL, 0, NULL, &pnode,
+                      0,
+                      CTLTYPE_NODE, "domain0", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_MACHDEP, CTL_CREATE, CTL_EOL);
+
+       if (pnode == NULL)
+               return;
+
+       sysctl_createv(NULL, 0, &pnode, &pnode,
+                      0,
+                      CTLTYPE_NODE, "diskcookie", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       if (pnode)
+               diskcookies = pnode;
+}
+
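+/*
+ * Pick the attach arguments (and hence the device name: sd, cd, wd or
+ * plain xbd) for a probed virtual disk, keyed on the (apparently
+ * Linux-compatible) major number Xen reports.  Zero-capacity entries
+ * that are not CD-ROMs are ignored.
+ */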
+static struct xbd_attach_args *
+get_xbda(xen_disk_t *xd)
+{
+
+       switch (XEN_MAJOR(xd->device)) {
+#if NSD > 0
+       case XEN_SCSI_DISK0_MAJOR:
+       case XEN_SCSI_DISK1_MAJOR ... XEN_SCSI_DISK7_MAJOR:
+       case XEN_SCSI_DISK8_MAJOR ... XEN_SCSI_DISK15_MAJOR:
+               if (xd->capacity == 0)
+                       return NULL;
+               return &sd_ata;
+       case XEN_SCSI_CDROM_MAJOR:
+               return &cd_ata;
+#endif
+#if NWD > 0
+       case XEN_IDE0_MAJOR:
+       case XEN_IDE1_MAJOR:
+       case XEN_IDE2_MAJOR:
+       case XEN_IDE3_MAJOR:
+       case XEN_IDE4_MAJOR:
+       case XEN_IDE5_MAJOR:
+       case XEN_IDE6_MAJOR:
+       case XEN_IDE7_MAJOR:
+       case XEN_IDE8_MAJOR:
+       case XEN_IDE9_MAJOR:
+               switch (XD_TYPE(xd->info)) {
+               case XD_TYPE_CDROM:
+                       return &cd_ata;
+               case XD_TYPE_DISK:
+                       if (xd->capacity == 0)
+                               return NULL;
+                       return &wd_ata;
+               default:
+                       return NULL;
+               }
+               break;
+#endif
+       default:
+               if (xd->capacity == 0)
+                       return NULL;
+               return &xbd_ata;
+       }
+       return NULL;
+}
+
+int
+xbd_scan(struct device *self, struct xbd_attach_args *mainbus_xbda,
+    cfprint_t print)
+{
+       struct xbdreq *xr;
+       struct xbd_attach_args *xbda;
+       xen_disk_t *xd;
+       int i;
+
+       init_interface();
+       if (xen_start_info.flags & SIF_PRIVILEGED)
+               setup_sysctl();
+
+#if NXBD > 0
+       xbd_major = devsw_name2blk("xbd", NULL, 0);
+#endif
+#if NWD > 0
+       xbd_wd_major = devsw_name2blk("wd", NULL, 0);
+       /* XXX Also handle the cdev majors since stuff like
+        * read_sector calls strategy on the cdev.  This only works if
+        * all the majors we care about are different.
+        */
+       xbd_wd_cdev_major = major(devsw_blk2chr(makedev(xbd_wd_major, 0)));
+#endif
+#if NSD > 0
+       xbd_sd_major = devsw_name2blk("sd", NULL, 0);
+       xbd_sd_cdev_major = major(devsw_blk2chr(makedev(xbd_sd_major, 0)));
+#endif
+#if NCD > 0
+       xbd_cd_major = devsw_name2blk("cd", NULL, 0);
+       xbd_cd_cdev_major = major(devsw_blk2chr(makedev(xbd_cd_major, 0)));
+#endif
+
+       MALLOC(xr, struct xbdreq *, BLK_RING_SIZE * sizeof(struct xbdreq),
+           M_DEVBUF, M_WAITOK | M_ZERO);
+#ifdef DEBUG
+       xbd_allxr = xr;
+#endif
+
+       /* XXX Xen1.2: We cannot use all BLK_RING_SIZE slots, since
+        * Xen 1.2 keeps the indexes masked in the ring and mishandles
+        * the case where we queue all slots at once.
+        */
+       for (i = 0; i < BLK_RING_SIZE - 1; i++)
+               PUT_XBDREQ(&xr[i]);
+
+       MALLOC(vbd_info, xen_disk_t *, MAX_VBDS * sizeof(xen_disk_t),
+           M_DEVBUF, M_WAITOK);
+       memset(vbd_info, 0, MAX_VBDS * sizeof(xen_disk_t));
+       nr_vbds  = get_vbd_info(vbd_info);
+       if (nr_vbds <= 0)
+               goto out;
+
+       for (i = 0; i < nr_vbds; i++) {
+               xd = &vbd_info[i];
+               xbda = get_xbda(xd);
+               if (xbda) {
+                       xbda->xa_xd = xd;
+                       config_found(self, xbda, print);
+               }
+       }
+
+       enable_update_events(self);
+
+       return 0;
+
+ out:
+       FREE(vbd_info, M_DEVBUF);
+       vbd_info = NULL;
+       FREE(xr, M_DEVBUF);
+#ifdef DEBUG
+       xbd_allxr = NULL;
+#endif
+       SLIST_INIT(&xbdreqs);
+       return 0;
+}
+
+#if NXBD > 0
+int
+xbd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "xbd") == 0)
+               return 1;
+       return 0;
+}
+#endif
+
+#if NWD > 0
+int
+xbd_wd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "wd") == 0)
+               return 1;
+       return 0;
+}
+#endif
+
+#if NSD > 0
+int
+xbd_sd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "sd") == 0)
+               return 1;
+       return 0;
+}
+#endif
+
+#if NCD > 0
+int
+xbd_cd_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xbd_attach_args *xa = (struct xbd_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "cd") == 0)
+               return 1;
+       return 0;
+}
+#endif
+
+static void
+xbd_attach(struct device *parent, struct device *self, void *aux)
+{
+       struct xbd_attach_args *xbda = (struct xbd_attach_args *)aux;
+       struct xbd_softc *xs = (struct xbd_softc *)self;
+
+       aprint_normal(": Xen Virtual Block Device");
+
+       simple_lock_init(&xs->sc_slock);
+       dk_sc_init(&xs->sc_dksc, xs, xs->sc_dev.dv_xname);
+       xbdinit(xs, xbda->xa_xd, xbda->xa_dkintf);
+       if (diskcookies) {
+               /* XXX beware that xs->sc_xd_device is a long */
+               sysctl_createv(NULL, 0, &diskcookies, NULL,
+                   0,
+                   CTLTYPE_INT, xs->sc_dev.dv_xname, NULL,
+                   NULL, 0, &xs->sc_xd_device, 0,
+                   CTL_CREATE, CTL_EOL);
+       }
+}
+
+static int
+xbd_detach(struct device *dv, int flags)
+{
+       struct  xbd_softc *xs = (struct xbd_softc *)dv;
+
+       /* 
+        * Mark disk about to be removed (between now and when the xs
+        * will be freed).
+        */
+       xs->sc_shutdown = 1;
+
+       /* And give it some time to settle if it's busy. */
+       if (xs->sc_dksc.sc_dkdev.dk_busy > 0)
+               tsleep(&xs, PWAIT, "xbdetach", hz);
+
+       /* Detach the disk. */
+       disk_detach(&xs->sc_dksc.sc_dkdev);
+
+       /* XXX decrement bufq_users and free? */
+
+       /* XXX no need to remove sysctl nodes since they only exist
+        * in domain0 and domain0's devices are never removed.
+        */
+
+       return 0;
+}
+
+int
+xbdopen(dev_t dev, int flags, int fmt, struct proc *p)
+{
+       struct  xbd_softc *xs;
+
+       DPRINTF_FOLLOW(("xbdopen(0x%04x, %d)\n", dev, flags));
+       switch (fmt) {
+       case S_IFCHR:
+               GETXBD_SOFTC_CDEV(xs, dev);
+               break;
+       case S_IFBLK:
+               GETXBD_SOFTC(xs, dev);
+               break;
+       default:
+               return ENXIO;
+       }
+       return dk_open(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p);
+}
+
+int
+xbdclose(dev_t dev, int flags, int fmt, struct proc *p)
+{
+       struct  xbd_softc *xs;
+
+       DPRINTF_FOLLOW(("xbdclose(%d, %d)\n", dev, flags));
+       switch (fmt) {
+       case S_IFCHR:
+               GETXBD_SOFTC_CDEV(xs, dev);
+               break;
+       case S_IFBLK:
+               GETXBD_SOFTC(xs, dev);
+               break;
+       default:
+               return ENXIO;
+       }
+       return dk_close(xs->sc_di, &xs->sc_dksc, dev, flags, fmt, p);
+}
+
+void
+xbdstrategy(struct buf *bp)
+{
+       struct  xbd_softc *xs = getxbd_softc(bp->b_dev);
+
+       DPRINTF_FOLLOW(("xbdstrategy(%p): b_bcount = %ld\n", bp,
+           (long)bp->b_bcount));
+
+       if (xs == NULL || xs->sc_shutdown) {
+               bp->b_flags |= B_ERROR;
+               bp->b_error = EIO;
+               biodone(bp);
+               return;
+       }
+
+       dk_strategy(xs->sc_di, &xs->sc_dksc, bp);
+       return;
+}
+
+int
+xbdsize(dev_t dev)
+{
+       struct xbd_softc *xs = getxbd_softc(dev);
+
+       DPRINTF_FOLLOW(("xbdsize(%d)\n", dev));
+       if (xs == NULL || xs->sc_shutdown)
+               return -1;
+       return dk_size(xs->sc_di, &xs->sc_dksc, dev);
+}
+
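+/*
+ * Xen wants sector-aligned buffers.  For a misaligned buf the transfer
+ * is bounced through a kernel buffer allocated here; map_align copies
+ * write data in, unmap_align copies read data back out.
+ */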
+static void
+map_align(struct xbdreq *xr)
+{
+       int s;
+
+       s = splvm();
+       xr->xr_aligned = uvm_km_kmemalloc1(kmem_map, NULL,
+           xr->xr_bqueue, XEN_BSIZE, UVM_UNKNOWN_OFFSET,
+           0/*  UVM_KMF_NOWAIT */);
+       splx(s);
+       DPRINTF(XBDB_IO, ("map_align(%p): bp %p addr %p align 0x%08lx "
+           "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data,
+           xr->xr_aligned, xr->xr_bqueue));
+       xr->xr_data = xr->xr_aligned;
+       if ((xr->xr_bp->b_flags & B_READ) == 0)
+               memcpy((void *)xr->xr_aligned, xr->xr_bp->b_data,
+                   xr->xr_bqueue);
+}
+
+static void
+unmap_align(struct xbdreq *xr)
+{
+       int s;
+
+       if (xr->xr_bp->b_flags & B_READ)
+               memcpy(xr->xr_bp->b_data, (void *)xr->xr_aligned,
+                   xr->xr_bp->b_bcount);
+       DPRINTF(XBDB_IO, ("unmap_align(%p): bp %p addr %p align 0x%08lx "
+           "size 0x%04lx\n", xr, xr->xr_bp, xr->xr_bp->b_data,
+           xr->xr_aligned, xr->xr_bp->b_bcount));
+       s = splvm();
+       uvm_km_free(kmem_map, xr->xr_aligned, xr->xr_bp->b_bcount);
+       splx(s);
+       xr->xr_aligned = (vaddr_t)0;
+}
+
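+/*
+ * Translate the next chunk of a request into one ring entry: walk the
+ * data a page at a time, convert each piece to a machine address and
+ * pack it as (machine address | sector count), up to MAX_BLK_SEGS
+ * segments per entry.
+ */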
+static void
+fill_ring(struct xbdreq *xr)
+{
+       struct xbdreq *pxr = xr->xr_parent;
+       paddr_t pa;
+       unsigned long ma;
+       vaddr_t addr, off;
+       blk_ring_req_entry_t *ring_req;
+       int breq, nr_sectors;
+
+       /* Fill out a communications ring structure. */
+       ring_req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req;
+       ring_req->id = (unsigned long)xr;
+       ring_req->operation = pxr->xr_bp->b_flags & B_READ ? XEN_BLOCK_READ :
+               XEN_BLOCK_WRITE;
+       ring_req->sector_number = (xen_sector_t)pxr->xr_bn;
+       ring_req->device = pxr->xr_sc->sc_xd_device;
+
+       DPRINTF(XBDB_IO, ("fill_ring(%d): bp %p sector %llu pxr %p xr %p\n",
+           MASK_BLK_IDX(req_prod), pxr->xr_bp, (unsigned long long)pxr->xr_bn,
+           pxr, xr));
+
+       xr->xr_breq = 0;
+       ring_req->nr_segments = 0;
+       addr = trunc_page(pxr->xr_data);
+       off = pxr->xr_data - addr;
+       while (pxr->xr_bqueue > 0) {
+#if 0
+               pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
+                   addr, &pa);
+#else
+               pmap_extract(pmap_kernel(), addr, &pa);
+#endif
+               ma = xpmap_ptom_masked(pa) + off;
+               DIAGCONDPANIC((ma & (XEN_BSIZE - 1)) != 0,
+                   ("xbd request ma not sector aligned"));
+
+               if (pxr->xr_bqueue > PAGE_SIZE - off)
+                       breq = PAGE_SIZE - off;
+               else
+                       breq = pxr->xr_bqueue;
+               nr_sectors = breq >> XEN_BSHIFT;
+               DIAGCONDPANIC(nr_sectors >= XEN_BSIZE,
+                   ("xbd request nr_sectors >= XEN_BSIZE"));
+
+               DPRINTF(XBDB_IO, ("fill_ring(%d): va 0x%08lx pa 0x%08lx "
+                   "ma 0x%08lx, sectors %d, left %ld/%ld\n",
+                   MASK_BLK_IDX(req_prod), addr, pa, ma, nr_sectors,
+                   pxr->xr_bqueue >> XEN_BSHIFT, pxr->xr_bqueue));
+
+               ring_req->buffer_and_sects[ring_req->nr_segments++] =
+                       ma | nr_sectors;
+               addr += PAGE_SIZE;
+               pxr->xr_bqueue -= breq;
+               pxr->xr_bn += nr_sectors;
+               xr->xr_breq += breq;
+               off = 0;
+               if (ring_req->nr_segments == MAX_BLK_SEGS)
+                       break;
+       }
+       pxr->xr_data = addr;
+
+       req_prod++;
+}
+
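+/*
+ * Resume requests that were suspended for want of ring slots; called
+ * from the response handler once slots are freed, and stops again as
+ * soon as the free list runs dry.
+ */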
+static void
+xbdresume(void)
+{
+       struct xbdreq *pxr, *xr;
+       struct xbd_softc *xs;
+       struct buf *bp;
+
+       while ((pxr = SIMPLEQ_FIRST(&xbdr_suspended)) != NULL) {
+               DPRINTF(XBDB_IO, ("xbdstart: resuming xbdreq %p for bp %p\n",
+                   pxr, pxr->xr_bp));
+               bp = pxr->xr_bp;
+               xs = getxbd_softc(bp->b_dev);
+               if (xs == NULL || xs->sc_shutdown) {
+                       bp->b_flags |= B_ERROR;
+                       bp->b_error = EIO;
+               }
+               if (bp->b_flags & B_ERROR) {
+                       pxr->xr_bdone -= pxr->xr_bqueue;
+                       pxr->xr_bqueue = 0;
+                       if (pxr->xr_bdone == 0) {
+                               bp->b_resid = bp->b_bcount;
+                               if (pxr->xr_aligned)
+                                       unmap_align(pxr);
+                               PUT_XBDREQ(pxr);
+                               if (xs)
+                                       disk_unbusy(&xs->sc_dksc.sc_dkdev,
+                                           (bp->b_bcount - bp->b_resid),
+                                           (bp->b_flags & B_READ));
+                               biodone(bp);
+                       }
+                       continue;
+               }
+               while (__predict_true(pxr->xr_bqueue > 0)) {
+                       GET_XBDREQ(xr);
+                       if (__predict_false(xr == NULL))
+                               goto out;
+                       xr->xr_parent = pxr;
+                       fill_ring(xr);
+               }
+               DPRINTF(XBDB_IO, ("xbdstart: resumed xbdreq %p for bp %p\n",
+                   pxr, bp));
+               SIMPLEQ_REMOVE_HEAD(&xbdr_suspended, xr_suspended);
+       }
+
+ out:
+       return;
+}
+
+static int
+xbdstart(struct dk_softc *dksc, struct buf *bp)
+{
+       struct  xbd_softc *xs;
+       struct xbdreq *pxr, *xr;
+       struct  partition *pp;
+       daddr_t bn;
+       int ret, runqueue;
+
+       DPRINTF_FOLLOW(("xbdstart(%p, %p)\n", dksc, bp));
+
+       runqueue = 1;
+       ret = -1;
+
+       xs = getxbd_softc(bp->b_dev);
+       if (xs == NULL || xs->sc_shutdown) {
+               bp->b_flags |= B_ERROR;
+               bp->b_error = EIO;
+               biodone(bp);
+               return 0;
+       }
+       dksc = &xs->sc_dksc;
+
+       /* XXXrcd:
+        * Translate partition relative blocks to absolute blocks,
+        * this probably belongs (somehow) in dksubr.c, since it
+        * is independent of the underlying code...  This will require
+        * that the interface be expanded slightly, though.
+        */
+       bn = bp->b_blkno;
+       if (DISKPART(bp->b_dev) != RAW_PART) {
+               pp = &xs->sc_dksc.sc_dkdev.dk_label->
+                       d_partitions[DISKPART(bp->b_dev)];
+               bn += pp->p_offset;
+       }
+
+       DPRINTF(XBDB_IO, ("xbdstart: addr %p, sector %llu, "
+           "count %ld [%s]\n", bp->b_data, (unsigned long long)bn,
+           bp->b_bcount, bp->b_flags & B_READ ? "read" : "write"));
+
+       GET_XBDREQ(pxr);
+       if (__predict_false(pxr == NULL))
+               goto out;
+
+       disk_busy(&dksc->sc_dkdev); /* XXX: put in dksubr.c */
+       /*
+        * We have a request slot, return 0 to make dk_start remove
+        * the bp from the work queue.
+        */
+       ret = 0;
+
+       pxr->xr_bp = bp;
+       pxr->xr_parent = pxr;
+       pxr->xr_bn = bn;
+       pxr->xr_bqueue = bp->b_bcount;
+       pxr->xr_bdone = bp->b_bcount;
+       pxr->xr_data = (vaddr_t)bp->b_data;
+       pxr->xr_sc = xs;
+
+       if (pxr->xr_data & (XEN_BSIZE - 1))
+               map_align(pxr);
+
+       fill_ring(pxr);
+
+       while (__predict_false(pxr->xr_bqueue > 0)) {
+               GET_XBDREQ(xr);
+               if (__predict_false(xr == NULL))
+                       break;
+               xr->xr_parent = pxr;
+               fill_ring(xr);
+       }
+
+       if (__predict_false(pxr->xr_bqueue > 0)) {
+               SIMPLEQ_INSERT_TAIL(&xbdr_suspended, pxr,
+                   xr_suspended);
+               DPRINTF(XBDB_IO, ("xbdstart: suspended xbdreq %p "
+                   "for bp %p\n", pxr, bp));
+       } else if (CANGET_XBDREQ() && BUFQ_PEEK(&bufq) != NULL) {
+               /* 
+                * We have enough resources to start another bp and
+                * there are additional bps on the queue, dk_start
+                * will call us again and we'll run the queue then.
+                */
+               runqueue = 0;
+       }
+
+ out:
+       if (runqueue && last_req_prod != req_prod)
+               signal_requests_to_xen();
+
+       return ret;
+}
+
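+/*
+ * Interrupt handler: consume response ring entries, credit completed
+ * bytes to the parent request, recycle child xbdreqs (resuming any
+ * suspended requests) and biodone the buf once fully transferred.
+ */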
+static int
+xbd_response_handler(void *arg)
+{
+       struct buf *bp;
+       struct xbd_softc *xs;
+       blk_ring_resp_entry_t *ring_resp;
+       struct xbdreq *pxr, *xr;
+       int i;
+
+       for (i = resp_cons; i != blk_ring->resp_prod; i = BLK_RING_INC(i)) {
+               ring_resp = &blk_ring->ring[MASK_BLK_IDX(i)].resp;
+               xr = (struct xbdreq *)ring_resp->id;
+               pxr = xr->xr_parent;
+
+               DPRINTF(XBDB_IO, ("xbd_response_handler(%d): pxr %p xr %p "
+                   "bdone %04lx breq %04lx\n", i, pxr, xr, pxr->xr_bdone,
+                   xr->xr_breq));
+               pxr->xr_bdone -= xr->xr_breq;
+               DIAGCONDPANIC(pxr->xr_bdone < 0,
+                   ("xbd_response_handler: pxr->xr_bdone < 0"));
+
+               if (__predict_false(ring_resp->status)) {
+                       pxr->xr_bp->b_flags |= B_ERROR;
+                       pxr->xr_bp->b_error = EIO;
+               }
+
+               if (xr != pxr) {
+                       PUT_XBDREQ(xr);
+                       if (!SIMPLEQ_EMPTY(&xbdr_suspended))
+                               xbdresume();
+               }
+
+               if (pxr->xr_bdone == 0) {
+                       bp = pxr->xr_bp;
+                       xs = getxbd_softc(bp->b_dev);
+                       if (xs == NULL) { /* don't fail bp if we're shutdown */
+                               bp->b_flags |= B_ERROR;
+                               bp->b_error = EIO;
+                       }
+                       DPRINTF(XBDB_IO, ("xbd_response_handler(%d): "
+                           "completed bp %p\n", i, bp));
+                       if (bp->b_flags & B_ERROR)
+                               bp->b_resid = bp->b_bcount;
+                       else
+                               bp->b_resid = 0;
+
+                       if (pxr->xr_aligned)
+                               unmap_align(pxr);
+
+                       PUT_XBDREQ(pxr);
+                       if (xs)
+                               disk_unbusy(&xs->sc_dksc.sc_dkdev,
+                                   (bp->b_bcount - bp->b_resid),
+                                   (bp->b_flags & B_READ));
+                       biodone(bp);
+                       if (!SIMPLEQ_EMPTY(&xbdr_suspended))
+                               xbdresume();
+                       /* XXX possible lockup if this was the only
+                        * active device and requests were held back in
+                        * the queue.
+                        */
+                       if (xs)
+                               dk_iodone(xs->sc_di, &xs->sc_dksc);
+               }
+       }
+       resp_cons = i;
+       /* check if xbdresume queued any requests */
+       if (last_req_prod != req_prod)
+               signal_requests_to_xen();
+       return 0;
+}
+
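+/* Map a xen_disk_t back to the attached xbd device, if any. */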
+static struct device *
+find_device(xen_disk_t *xd)
+{
+       struct device *dv;
+       struct xbd_softc *xs;
+
+       for (dv = alldevs.tqh_first; dv != NULL; dv = dv->dv_list.tqe_next) {
+               if (dv->dv_cfattach == NULL ||
+                   dv->dv_cfattach->ca_attach != xbd_attach)
+                       continue;
+               xs = (struct xbd_softc *)dv;
+               if (xs->sc_xd_device == xd->device)
+                       break;
+       }
+       return dv;
+}
+
+static void
+xbd_update_create_kthread(void *arg)
+{
+
+       kthread_create1(xbd_update_kthread, arg, NULL, "xbdupdate");
+}
+
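+/*
+ * Hotplug kthread: each time it is woken it re-reads the VBD list
+ * via get_vbd_info() and walks it against the cached copy, detaching
+ * devices that disappeared and attaching newly appeared ones.
+ */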
+static void
+xbd_update_kthread(void *arg)
+{
+       struct device *parent = arg;
+       struct xbd_attach_args *xbda;
+       struct device *dev;
+       xen_disk_t *xd;
+       xen_disk_t *vbd_info_update, *vbd_info_old;
+       int i, j, new_nr_vbds;
+       extern int hypervisor_print(void *, const char *);
+
+       MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS *
+           sizeof(xen_disk_t), M_DEVBUF, M_WAITOK);
+
+       for (;;) {
+               memset(vbd_info_update, 0, MAX_VBDS * sizeof(xen_disk_t));
+               new_nr_vbds  = get_vbd_info(vbd_info_update);
+
+               if (memcmp(vbd_info, vbd_info_update, MAX_VBDS *
+                   sizeof(xen_disk_t)) == 0) {
+                       FREE(vbd_info_update, M_DEVBUF);
+                       tsleep(parent, PWAIT, "xbdupd", 0);
+                       MALLOC(vbd_info_update, xen_disk_t *, MAX_VBDS *
+                           sizeof(xen_disk_t), M_DEVBUF, M_WAITOK);
+                       continue;
+               }
+
+               j = 0;
+               for (i = 0; i < new_nr_vbds; i++) {
+                       while (j < nr_vbds &&
+                           vbd_info[j].device < vbd_info_update[i].device) {
+                               DPRINTF(XBDB_HOTPLUG,
+                                   ("delete device %x size %lx\n",
+                                       vbd_info[j].device,
+                                       vbd_info[j].capacity));
+                               xd = &vbd_info[j];
+                               dev = find_device(xd);
+                               if (dev)
+                                       config_detach(dev, DETACH_FORCE);
+                               j++;
+                       }
+                       if (j < nr_vbds &&
+                           vbd_info[j].device == vbd_info_update[i].device) {
+                               DPRINTF(XBDB_HOTPLUG,
+                                   ("update device %x size %lx size %lx\n",
+                                       vbd_info_update[i].device,
+                                       vbd_info[j].capacity,
+                                       vbd_info_update[i].capacity));
+                               j++;
+                       } else {
+                               DPRINTF(XBDB_HOTPLUG,
+                                   ("add device %x size %lx\n",
+                                       vbd_info_update[i].device,
+                                       vbd_info_update[i].capacity));
+                               xd = &vbd_info_update[i];
+                               xbda = get_xbda(xd);
+                               if (xbda) {
+                                       xbda->xa_xd = xd;
+                                       config_found(parent, xbda, hypervisor_print);
+                               }
+                       }
+               }
+
+               while (j < nr_vbds) {
+                       DPRINTF(XBDB_HOTPLUG, ("delete device %x\n",
+                           vbd_info[j].device));
+                       xd = &vbd_info[j];
+                       dev = find_device(xd);
+                       if (dev)
+                               config_detach(dev, DETACH_FORCE);
+                       j++;
+               }
+
+               nr_vbds = new_nr_vbds;
+
+               vbd_info_old = vbd_info;
+               vbd_info = vbd_info_update;
+               vbd_info_update = vbd_info_old;
+       }
+}
+
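+/* Event-channel interrupt handler: just wake the update kthread. */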
+static int
+xbd_update_handler(void *arg)
+{
+
+       wakeup(arg);
+
+       return 0;
+}
+
+/* XXX: we should probably put these into dksubr.c, mostly */
+int
+xbdread(dev_t dev, struct uio *uio, int flags)
+{
+       struct  xbd_softc *xs;
+       struct  dk_softc *dksc;
+
+       DPRINTF_FOLLOW(("xbdread(%d, %p, %d)\n", dev, uio, flags));
+       GETXBD_SOFTC_CDEV(xs, dev);
+       dksc = &xs->sc_dksc;
+       if ((dksc->sc_flags & DKF_INITED) == 0)
+               return ENXIO;
+       /* XXX see the comments about minphys in ccd.c */
+       return physio(xbdstrategy, NULL, dev, B_READ, minphys, uio);
+}
+
+/* XXX: we should probably put these into dksubr.c, mostly */
+int
+xbdwrite(dev_t dev, struct uio *uio, int flags)
+{
+       struct  xbd_softc *xs;
+       struct  dk_softc *dksc;
+
+       DPRINTF_FOLLOW(("xbdwrite(%d, %p, %d)\n", dev, uio, flags));
+       GETXBD_SOFTC_CDEV(xs, dev);
+       dksc = &xs->sc_dksc;
+       if ((dksc->sc_flags & DKF_INITED) == 0)
+               return ENXIO;
+       /* XXX see the comments about minphys in ccd.c */
+       return physio(xbdstrategy, NULL, dev, B_WRITE, minphys, uio);
+}
+
+int
+xbdioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+       struct  xbd_softc *xs;
+       struct  dk_softc *dksc;
+       int     ret;
+
+       DPRINTF_FOLLOW(("xbdioctl(%d, %08lx, %p, %d, %p)\n",
+           dev, cmd, data, flag, p));
+       GETXBD_SOFTC(xs, dev);
+       dksc = &xs->sc_dksc;
+
+       if ((ret = lockmgr(&dksc->sc_lock, LK_EXCLUSIVE, NULL)) != 0)
+               return ret;
+
+       switch (cmd) {
+       default:
+               ret = dk_ioctl(xs->sc_di, dksc, dev, cmd, data, flag, p);
+               break;
+       }
+
+       lockmgr(&dksc->sc_lock, LK_RELEASE, NULL);
+       return ret;
+}
+
+int
+xbdioctl_cdev(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+       dev_t bdev;
+
+       bdev = devsw_chr2blk(dev);
+       if (bdev == NODEV)
+               return ENXIO;
+       return xbdioctl(bdev, cmd, data, flag, p);
+}
+
+int
+xbddump(dev_t dev, daddr_t blkno, caddr_t va, size_t size)
+{
+       struct  xbd_softc *xs;
+
+       DPRINTF_FOLLOW(("xbddump(%d, %" PRId64 ", %p, %lu)\n", dev, blkno, va,
+           (unsigned long)size));
+       GETXBD_SOFTC(xs, dev);
+       return dk_dump(xs->sc_di, &xs->sc_dksc, dev, blkno, va, size);
+}
+
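+/*
+ * Common attach-time setup: record the backend device, fabricate a
+ * disk geometry, share the global bufq, attach the disk and read
+ * its label.
+ */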
+static int
+xbdinit(struct xbd_softc *xs, xen_disk_t *xd, struct dk_intf *dkintf)
+{
+       struct dk_geom *pdg;
+       char buf[9];
+       int ret;
+
+       ret = 0;
+
+       xs->sc_dksc.sc_size = xd->capacity;
+       xs->sc_xd_device = xd->device;
+       xs->sc_di = dkintf;
+       xs->sc_shutdown = 0;
+
+       /*
+        * XXX here we should probe the underlying device.  If we
+        *     are accessing a partition of type RAW_PART, then
+        *     we should populate our initial geometry with the
+        *     geometry that we discover from the device.
+        */
+       pdg = &xs->sc_dksc.sc_geom;
+       pdg->pdg_secsize = DEV_BSIZE;
+       pdg->pdg_ntracks = 1;
+       pdg->pdg_nsectors = 1024 * (1024 / pdg->pdg_secsize);
+       pdg->pdg_ncylinders = xs->sc_dksc.sc_size / pdg->pdg_nsectors;
+
+       /*
+        * We have one shared bufq for all devices because otherwise
+        * requests could stall if no free request slots were
+        * available in xbdstart and this device had no requests
+        * in-flight that would trigger a dk_start from the interrupt
+        * handler.
+        * XXX this assumes that we can just memcpy struct bufq_state
+        *     to share it between devices.
+        * XXX we reference count the usage so that we can de-allocate
+        *     the bufq once all devices are deconfigured.
+        */
+       if (bufq_users == 0) {
+               bufq_alloc(&bufq, BUFQ_FCFS);
+               bufq_users = 1;
+       }
+       memcpy(&xs->sc_dksc.sc_bufq, &bufq, sizeof(struct bufq_state));
+
+       xs->sc_dksc.sc_flags |= DKF_INITED;
+
+       /* Attach the disk. */
+       disk_attach(&xs->sc_dksc.sc_dkdev);
+
+       /* Try to read the disklabel. */
+       dk_getdisklabel(xs->sc_di, &xs->sc_dksc, 0 /* XXX ? */);
+
+       format_bytes(buf, sizeof(buf), (uint64_t)xs->sc_dksc.sc_size *
+           pdg->pdg_secsize);
+       printf(" %s\n", buf);
+
+/*   out: */
+       return ret;
+}
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xen_debug.c
new file mode 100644 (file)
index 0000000..8181f2b
--- /dev/null
@@ -0,0 +1,444 @@
+/*     $NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $     */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *
+ * Copyright (c) 2002-2003, K A Fraser & R Neugebauer
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xen_debug.c,v 1.1.2.1 2004/05/22 15:59:31 he Exp $");
+
+#define XENDEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#ifdef XENDEBUG
+
+#define PRINTK_BUFSIZE 1024
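+/*
+ * Format into a static buffer and write it directly to the Xen
+ * console via the console_io hypercall, bypassing the tty layer.
+ */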
+void
+printk(const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+       static char buf[PRINTK_BUFSIZE];
+
+       va_start(ap, fmt);
+       ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
+       va_end(ap);
+       if (ret >= PRINTK_BUFSIZE - 1)
+               ret = PRINTK_BUFSIZE - 1; /* vsnprintf returns the untruncated length */
+       buf[ret] = 0;
+       (void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+void
+vprintk(const char *fmt, va_list ap)
+{
+       int ret;
+       static char buf[PRINTK_BUFSIZE];
+
+       ret = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
+       if (ret >= PRINTK_BUFSIZE - 1)
+               ret = PRINTK_BUFSIZE - 1; /* clamp if the output was truncated */
+       buf[ret] = 0;
+       (void)HYPERVISOR_console_io(CONSOLEIO_write, ret, buf);
+}
+
+#endif
+
+#ifdef XENDEBUG_LOW
+
+int xen_once = 0;
+
+void hypervisor_callback(void);
+void failsafe_callback(void);
+
+void xen_dbglow_init(void);
+void
+xen_dbglow_init()
+{
+       start_info_t *si;
+#if 0
+       int i;
+#endif
+
+       si = &xen_start_info;
+
+       HYPERVISOR_set_callbacks(
+               __KERNEL_CS, (unsigned long)hypervisor_callback,
+               __KERNEL_CS, (unsigned long)failsafe_callback);
+
+       trap_init();
+
+       /* __sti(); */
+
+       /* print out some useful information  */
+       printk(version);
+       printk("start_info:   %p\n",  si);
+       printk("  nr_pages:   %lu",   si->nr_pages);
+       printk("  shared_inf: %p (was %p)\n",  HYPERVISOR_shared_info,
+           si->shared_info);
+       printk("  pt_base:    %p",    (void *)si->pt_base); 
+       printk("  mod_start:  0x%lx\n", si->mod_start);
+       printk("  mod_len:    %lu\n", si->mod_len); 
+#if 0
+       printk("  net_rings: ");
+       for (i = 0; i < MAX_DOMAIN_VIFS; i++) {
+               if (si->net_rings[i] == 0)
+                       break;
+               printk(" %lx", si->net_rings[i]);
+       };
+       printk("\n");
+       printk("  blk_ring:   0x%lx\n", si->blk_ring);
+#endif
+       printk("  dom_id:     %d\n",  si->dom_id);
+       printk("  flags:      0x%lx\n", si->flags);
+       printk("  cmd_line:   %s\n",  si->cmd_line ?
+           (const char *)si->cmd_line : "NULL");
+}
+
+
+void xen_dbg0(char *);
+void
+xen_dbg0(char *end)
+{
+       struct cpu_info *ci;
+
+       ci = &cpu_info_primary;
+       if (xen_once)
+               printk("xencpu level %d ipending %08x master %08x\n",
+                   ci->ci_ilevel, ci->ci_ipending,
+                   HYPERVISOR_shared_info->events_mask);
+               /* ipending %08x imask %08x iunmask %08x */
+               /* ci->ci_imask[IPL_NET], ci->ci_iunmask[IPL_NET]); */
+}
+
+void xen_dbg1(void *esp, int ss);
+void
+xen_dbg1(void *esp, int ss)
+{
+#if 1
+       struct cpu_info *ci;
+
+       ci = &cpu_info_primary;
+       if (xen_once)
+               printk("xenhighlevel %d ipending %08x master %08x events %08x\n",
+                   ci->ci_ilevel, ci->ci_ipending,
+                   HYPERVISOR_shared_info->events_mask,
+                   HYPERVISOR_shared_info->events);
+#else
+       printk("stack switch %p %d/%d, sp %p\n", esp, ss, IDXSEL(ss), &ss);
+#endif
+}
+
+void xen_dbg2(void);
+void
+xen_dbg2(void)
+{
+       if (xen_once)
+               printk("xen_dbg2\n");
+}
+
+void xen_dbg3(void *, void *);
+void
+xen_dbg3(void *ss, void *esp)
+{
+       if (xen_once)
+               printk("xen_dbg3 %p %p\n", ss, esp);
+}
+
+void xen_dbg4(void *);
+void
+xen_dbg4(void *esi)
+{
+
+       printk("xen_dbg4 %p\n", esi);
+       for(;;);
+}
+
+
+
+
+static void do_exit(void);
+
+/*
+ * These are assembler stubs in vector.S.
+ * They are the actual entry points for virtual exceptions.
+ */
+void divide_error(void);
+void debug(void);
+void int3(void);
+void overflow(void);
+void bounds(void);
+void invalid_op(void);
+void device_not_available(void);
+void double_fault(void);
+void coprocessor_segment_overrun(void);
+void invalid_TSS(void);
+void segment_not_present(void);
+void stack_segment(void);
+void general_protection(void);
+void page_fault(void);
+void coprocessor_error(void);
+void simd_coprocessor_error(void);
+void alignment_check(void);
+void spurious_interrupt_bug(void);
+void machine_check(void);
+
+static void
+dump_regs(struct pt_regs *regs)
+{
+       int in_kernel = 1;
+       unsigned long esp;
+       unsigned short ss;
+
+       esp = (unsigned long) (&regs->esp);
+       ss = __KERNEL_DS;
+       if (regs->xcs & 2) {
+               in_kernel = 0;
+               esp = regs->esp;
+               ss = regs->xss & 0xffff;
+       }
+       printf("EIP:    %04x:[<%08lx>]\n",
+           0xffff & regs->xcs, regs->eip);
+       printf("EFLAGS: %08lx\n",regs->eflags);
+       printf("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+           regs->eax, regs->ebx, regs->ecx, regs->edx);
+       printf("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+           regs->esi, regs->edi, regs->ebp, esp);
+       printf("ds: %04x   es: %04x   ss: %04x\n",
+           regs->xds & 0xffff, regs->xes & 0xffff, ss);
+       printf("\n");
+}      
+
+
+static inline void
+dump_code(unsigned eip)
+{
+       unsigned *ptr = (unsigned *)eip;
+       int x;
+
+       printk("Bytes at eip:\n");
+       for (x = -4; x < 5; x++)
+               printf(" %x", ptr[x]);  /* space-separate the words */
+}
+
+
+/*
+ * C handlers here have their parameter-list constructed by the
+ * assembler stubs above. Each one gets a pointer to a list
+ * of register values (to be restored at end of exception).
+ * Some will also receive an error code -- this is the code that
+ * was generated by the processor for the underlying real exception. 
+ * 
+ * Note that the page-fault exception is special. It also receives
+ * the faulting linear address. Normally this would be found in
+ * register CR2, but that is not accessible in a virtualised OS.
+ */
+
+static inline void
+do_trap(int trapnr, char *str, struct pt_regs *regs, long error_code)
+{
+
+       printk("FATAL:  Unhandled Trap (see mini-os:traps.c)");
+       printf("%d %s", trapnr, str);
+       dump_regs(regs);
+       dump_code(regs->eip);
+
+       do_exit();
+}
+
+#define DO_ERROR(trapnr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code); \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+       do_trap(trapnr, str, regs, error_code); \
+}
+
+#define DO_ERROR_INFO(trapnr, str, name, sicode, siaddr) \
+void do_##name(struct pt_regs *regs, long error_code); \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+       do_trap(trapnr, str, regs, error_code); \
+}
+
+DO_ERROR_INFO( 0, "divide error", divide_error, FPE_INTDIV, regs->eip)
+DO_ERROR( 3, "int3", int3)
+DO_ERROR( 4, "overflow", overflow)
+DO_ERROR( 5, "bounds", bounds)
+DO_ERROR_INFO( 6, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
+DO_ERROR( 7, "device not available", device_not_available)
+DO_ERROR( 8, "double fault", double_fault)
+DO_ERROR( 9, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, "invalid TSS", invalid_TSS)
+DO_ERROR(11, "segment not present", segment_not_present)
+DO_ERROR(12, "stack segment", stack_segment)
+DO_ERROR_INFO(17, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(18, "machine check", machine_check)
+
+void do_page_fault(struct pt_regs *, long, unsigned long);
+void
+do_page_fault(struct pt_regs *regs, long error_code, unsigned long address)
+{
+
+       printk("Page fault\n");
+       printk("Address: 0x%lx", address);
+       printk("Error Code: 0x%lx", error_code);
+       printk("eip: \t 0x%lx", regs->eip);
+       do_exit();
+}
+
+void do_general_protection(struct pt_regs *, long);
+void
+do_general_protection(struct pt_regs *regs, long error_code)
+{
+
+       HYPERVISOR_shared_info->events_mask = 0;
+       printk("GPF\n");
+       printk("Error Code: 0x%lx", error_code);
+       dump_regs(regs);
+       dump_code(regs->eip);
+       do_exit();
+}
+
+
+void do_debug(struct pt_regs *, long);
+void
+do_debug(struct pt_regs *regs, long error_code)
+{
+
+       printk("Debug exception\n");
+#define TF_MASK 0x100
+       regs->eflags &= ~TF_MASK;
+       dump_regs(regs);
+       do_exit();
+}
+
+
+
+void do_coprocessor_error(struct pt_regs *, long);
+void
+do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+
+       printk("Copro error\n");
+       dump_regs(regs);
+       dump_code(regs->eip);
+       do_exit();
+}
+
+void simd_math_error(void *);
+void
+simd_math_error(void *eip)
+{
+
+       printk("SIMD error\n");
+}
+
+void do_simd_coprocessor_error(struct pt_regs *, long);
+void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+
+       printk("SIMD copro error\n");
+}
+
+void do_spurious_interrupt_bug(struct pt_regs *, long);
+void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+}
+
+static void
+do_exit(void)
+{
+
+       HYPERVISOR_exit();
+}
+
+/*
+ * Submit a virtual IDT to the hypervisor. This consists of tuples
+ * (interrupt vector, privilege ring, CS:EIP of handler).
+ * The 'privilege ring' field specifies the least-privileged ring that
+ * can trap to that vector using a software-interrupt instruction (INT).
+ */
+static trap_info_t trap_table[] = {
+    {  0, 0, __KERNEL_CS, (unsigned long)divide_error                },
+    {  1, 0, __KERNEL_CS, (unsigned long)debug                       },
+    {  3, 3, __KERNEL_CS, (unsigned long)int3                        },
+    {  4, 3, __KERNEL_CS, (unsigned long)overflow                    },
+    {  5, 3, __KERNEL_CS, (unsigned long)bounds                      },
+    {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                  },
+    {  7, 0, __KERNEL_CS, (unsigned long)device_not_available        },
+    {  8, 0, __KERNEL_CS, (unsigned long)double_fault                },
+    {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+    { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                 },
+    { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present         },
+    { 12, 0, __KERNEL_CS, (unsigned long)stack_segment               },
+    { 13, 0, __KERNEL_CS, (unsigned long)general_protection          },
+    { 14, 0, __KERNEL_CS, (unsigned long)page_fault                  },
+    { 15, 0, __KERNEL_CS, (unsigned long)spurious_interrupt_bug      },
+    { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error           },
+    { 17, 0, __KERNEL_CS, (unsigned long)alignment_check             },
+    { 18, 0, __KERNEL_CS, (unsigned long)machine_check               },
+    { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error      },
+    {  0, 0,           0, 0                           }
+};
+    
+void
+trap_init(void)
+{
+
+       HYPERVISOR_set_trap_table(trap_table);    
+}
+#endif
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xencons.c
new file mode 100644 (file)
index 0000000..a151e3d
--- /dev/null
@@ -0,0 +1,352 @@
+/*     $NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $       */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xencons.c,v 1.1.2.1 2004/05/22 15:59:21 he Exp $");
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+#include <sys/conf.h>
+
+#include <machine/stdarg.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+
+#include <dev/cons.h>
+
+#include <ddb/db_output.h>     /* XXX for db_max_line */
+
+static int xencons_isconsole = 0;
+
+#define        XENCONS_UNIT(x) (minor(x))
+#define XENCONS_BURST 128
+
+int xencons_match (struct device *, struct cfdata *, void *);
+void xencons_attach (struct device *, struct device *, void *);
+/* int xencons_intr (void *); */
+void xencons_init (void);
+
+struct xencons_softc {
+       struct  device sc_dev;
+       struct  tty *sc_tty;
+};
+
+CFATTACH_DECL(xencons, sizeof(struct xencons_softc),
+    xencons_match, xencons_attach, NULL, NULL);
+
+extern struct cfdriver xencons_cd;
+
+dev_type_open(xencons_open);
+dev_type_close(xencons_close);
+dev_type_read(xencons_read);
+dev_type_write(xencons_write);
+dev_type_ioctl(xencons_ioctl);
+dev_type_stop(xencons_stop);
+dev_type_tty(xencons_tty);
+dev_type_poll(xencons_poll);
+
+const struct cdevsw xencons_cdevsw = {
+       xencons_open, xencons_close, xencons_read, xencons_write,
+       xencons_ioctl, xencons_stop, xencons_tty, xencons_poll,
+       NULL, ttykqfilter, D_TTY
+};
+
+
+void xenconscn_attach(void);
+int xenconscn_getc(dev_t);
+void xenconscn_putc(dev_t, int);
+void xenconscn_pollc(dev_t, int);
+
+static struct consdev xencons = {
+       NULL, NULL, xenconscn_getc, xenconscn_putc, xenconscn_pollc,
+       NULL, NULL, NULL, NODEV, CN_NORMAL
+};
+
+void   xencons_start (struct tty *);
+int    xencons_param (struct tty *, struct termios *);
+
+int
+xencons_match(struct device *parent, struct cfdata *match, void *aux)
+{
+       struct xencons_attach_args *xa = (struct xencons_attach_args *)aux;
+
+       if (strcmp(xa->xa_device, "xencons") == 0)
+               return 1;
+       return 0;
+}
+
+void
+xencons_attach(struct device *parent, struct device *self, void *aux)
+{
+       struct xencons_softc *sc = (void *)self;
+
+       aprint_normal(": Xen Virtual Console Driver\n");
+
+       if (xencons_isconsole) {
+               int maj;
+
+               /* Locate the major number. */
+               maj = cdevsw_lookup_major(&xencons_cdevsw);
+
+               /* There can be only one, but it can have any unit number. */
+               cn_tab->cn_dev = makedev(maj, sc->sc_dev.dv_unit);
+
+               aprint_verbose("%s: console major %d, unit %d\n",
+                   sc->sc_dev.dv_xname, maj, sc->sc_dev.dv_unit);
+
+               /* Set db_max_line to avoid paging. */
+               db_max_line = 0x7fffffff;
+       }
+}
+
+int
+xencons_open(dev_t dev, int flag, int mode, struct proc *p)
+{
+       struct xencons_softc *sc;
+       int unit = XENCONS_UNIT(dev);
+       struct tty *tp;
+
+       sc = device_lookup(&xencons_cd, unit);
+       if (sc == NULL)
+               return (ENXIO);
+
+       if (!sc->sc_tty) {
+               tp = sc->sc_tty = ttymalloc();
+               tty_attach(tp);
+       } else
+               tp = sc->sc_tty;
+
+       tp->t_oproc = xencons_start;
+       tp->t_param = xencons_param;
+       tp->t_dev = dev;
+       if ((tp->t_state & TS_ISOPEN) == 0) {
+               ttychars(tp);
+               tp->t_iflag = TTYDEF_IFLAG;
+               tp->t_oflag = TTYDEF_OFLAG;
+               tp->t_cflag = TTYDEF_CFLAG;
+               tp->t_lflag = TTYDEF_LFLAG;
+               tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED;
+               xencons_param(tp, &tp->t_termios);
+               ttsetwater(tp);
+       } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0)
+               return (EBUSY);
+       tp->t_state |= TS_CARR_ON;
+
+       return ((*tp->t_linesw->l_open)(dev, tp));
+}
+
+int
+xencons_close(dev_t dev, int flag, int mode, struct proc *p)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+
+       if (tp == NULL)
+               return (0);
+       (*tp->t_linesw->l_close)(tp, flag);
+       ttyclose(tp);
+#ifdef notyet /* XXX */
+       ttyfree(tp);
+#endif
+       return (0);
+}
+
+int
+xencons_read(dev_t dev, struct uio *uio, int flag)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+
+       return ((*tp->t_linesw->l_read)(tp, uio, flag));
+}
+
+int
+xencons_write(dev_t dev, struct uio *uio, int flag)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+
+       return ((*tp->t_linesw->l_write)(tp, uio, flag));
+}
+
+int
+xencons_poll(dev_t dev, int events, struct proc *p)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+       return ((*tp->t_linesw->l_poll)(tp, events, p));
+}
+
+struct tty *
+xencons_tty(dev_t dev)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+
+       return (tp);
+}
+
+int
+xencons_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+       struct xencons_softc *sc = device_lookup(&xencons_cd,
+           XENCONS_UNIT(dev));
+       struct tty *tp = sc->sc_tty;
+       int error;
+
+       error = (*tp->t_linesw->l_ioctl)(tp, cmd, data, flag, p);
+       if (error != EPASSTHROUGH)
+               return (error);
+
+       error = ttioctl(tp, cmd, data, flag, p);
+       if (error != EPASSTHROUGH)
+               return (error);
+
+       switch (cmd) {
+       default:
+               return (EPASSTHROUGH);
+       }
+
+#ifdef DIAGNOSTIC
+       panic("xencons_ioctl: impossible");
+#endif
+}
+
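+/*
+ * Drain up to XENCONS_BURST bytes from the tty output queue and emit
+ * them with a single console_io hypercall; if output remains, arrange
+ * a restart via ttrstrt().
+ */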
+void
+xencons_start(struct tty *tp)
+{
+       struct clist *cl;
+       int s, len;
+       u_char buf[XENCONS_BURST+1];
+
+       s = spltty();
+       if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP))
+               goto out;
+       tp->t_state |= TS_BUSY;
+       splx(s);
+
+       /*
+        * We need to do this outside spl since it could be fairly
+        * expensive and we don't want our serial ports to overflow.
+        */
+       cl = &tp->t_outq;
+       len = q_to_b(cl, buf, XENCONS_BURST);
+       (void)HYPERVISOR_console_io(CONSOLEIO_write, len, buf);
+
+       s = spltty();
+       tp->t_state &= ~TS_BUSY;
+       if (cl->c_cc) {
+               tp->t_state |= TS_TIMEOUT;
+               callout_reset(&tp->t_rstrt_ch, 1, ttrstrt, tp);
+       }
+       if (cl->c_cc <= tp->t_lowat) {
+               if (tp->t_state & TS_ASLEEP) {
+                       tp->t_state &= ~TS_ASLEEP;
+                       wakeup(cl);
+               }
+               selwakeup(&tp->t_wsel);
+       }
+out:
+       splx(s);
+}
+
+void
+xencons_stop(struct tty *tp, int flag)
+{
+
+}
+
+
+
+void
+xenconscn_attach()
+{
+
+       cn_tab = &xencons;
+
+       xencons_isconsole = 1;
+}
+
+int
+xenconscn_getc(dev_t dev)
+{
+
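+       /* no console input is implemented; emit a newline and spin */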
+       printf("\n");
+       for (;;);
+}
+
+#define MAXLINELEN 1024
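+/*
+ * Buffer console output and write whole lines (or a full buffer)
+ * with a single console_io hypercall.
+ */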
+void
+xenconscn_putc(dev_t dev, int c)
+{
+       static char buf[MAXLINELEN+1];
+       static int bufpos = 0;
+
+       buf[bufpos++] = c;
+       /* flush on newline or when the buffer fills, to avoid overflowing it */
+       if (c == '\n' || bufpos == MAXLINELEN) {
+               buf[bufpos] = 0;
+               (void)HYPERVISOR_console_io(CONSOLEIO_write, bufpos, buf);
+               bufpos = 0;
+       }
+}
+
+void
+xenconscn_pollc(dev_t dev, int on)
+{
+       
+}
+
+/*
+ * Set line parameters.
+ */
+int
+xencons_param(struct tty *tp, struct termios *t)
+{
+
+       tp->t_ispeed = t->c_ispeed;
+       tp->t_ospeed = t->c_ospeed;
+       tp->t_cflag = t->c_cflag;
+       return (0);
+}
+
diff --git a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/xenkbc.c
new file mode 100644 (file)
index 0000000..e546155
--- /dev/null
@@ -0,0 +1,600 @@
+/* $NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $ */
+
+/*
+ *
+ * Copyright (c) 2004 Christian Limpach.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed by Christian Limpach.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2004 Ben Harris.
+ * Copyright (c) 1998
+ *     Matthias Drochner.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: xenkbc.c,v 1.3.2.1 2004/05/22 15:57:43 he Exp $");
+
+#include <sys/param.h>
+#include <sys/device.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+
+#include <dev/pckbport/pckbportvar.h>
+#include <dev/ic/i8042reg.h>
+
+#include <machine/intr.h>
+
+#include <machine/xenkbcvar.h>
+#include <machine/xen.h>
+#include <machine/hypervisor.h>
+#include <machine/hypervisor-ifs/kbd.h>
+#include <machine/evtchn.h>
+
+#define        KBC_DELAY       DELAY(1000)
+#define        KBC_TIMEOUT     250
+
+#define        XENKBC_NSLOTS   2
+
+/* data per slave device */
+struct xenkbc_slotdata {
+       int xsd_polling;        /* don't process data in interrupt handler */
+       int xsd_poll_data;      /* data read from intr handler if polling */
+       int xsd_poll_stat;      /* status read from intr handler if polling */
+#if NRND > 0
+       rndsource_element_t     xsd_rnd_source;
+#endif
+};
+
+struct xenkbc_internal {
+       struct xenkbc_softc     *xi_sc;
+       struct pckbport_tag     *xi_pt;
+       struct xenkbc_slotdata  *xi_slotdata[XENKBC_NSLOTS];
+       int                     xi_flags;
+       int                     xi_data;
+       int                     xi_8042cmdbyte;
+};
+
+#define        XI_CONSOLE_FLAG         0x01
+#define        XI_HASAUX_FLAG          0x02
+
+#define        XI_CONSOLE(xi)          ((xi)->xi_flags & XI_CONSOLE_FLAG)
+#define        XI_HASAUX(xi)           ((xi)->xi_flags & XI_HASAUX_FLAG)
+
+#define        XI_SETCONSOLE(xi,on)    \
+       ((on) ? ((xi)->xi_flags |= XI_CONSOLE_FLAG) : \
+               ((xi)->xi_flags &= ~XI_CONSOLE_FLAG))
+#define        XI_SETHASAUX(xi,on)     \
+       ((on) ? ((xi)->xi_flags |= XI_HASAUX_FLAG) : \
+               ((xi)->xi_flags &= ~XI_HASAUX_FLAG))
+
+static int xenkbc_match(struct device *, struct cfdata *, void *);
+static void xenkbc_attach(struct device *, struct device *, void *);
+
+static int xenkbc_xt_translation(void *, pckbport_slot_t, int);
+static void xenkbc_init_slotdata(struct xenkbc_slotdata *);
+
+static int xenkbc_get8042cmd (struct xenkbc_internal *);
+static int xenkbc_put8042cmd (struct xenkbc_internal *);
+static int xenkbc_send_devcmd(void *, pckbport_slot_t, u_char);
+static int xenkbc_send_cmd(void *, u_char);
+static int xenkbc_send_data(void *, u_char);
+static int xenkbc_poll_data1(void *, pckbport_slot_t);
+
+static void xenkbc_slot_enable(void *, pckbport_slot_t, int);
+static void xenkbc_intr_establish(void *, pckbport_slot_t);
+static void xenkbc_set_poll(void *, pckbport_slot_t, int);
+
+static int xenkbc_intr(void *);
+
+CFATTACH_DECL(xenkbc, sizeof(struct xenkbc_softc),
+    xenkbc_match, xenkbc_attach, NULL, NULL);
+
+static struct pckbport_accessops const xenkbc_ops = {
+       xenkbc_xt_translation,
+       xenkbc_send_devcmd,
+       xenkbc_poll_data1,
+       xenkbc_slot_enable,
+       xenkbc_intr_establish,
+       xenkbc_set_poll
+};
+
+static struct xenkbc_internal xenkbc_consdata;
+static struct xenkbc_slotdata xenkbc_cons_slotdata;
+
+/*  #define XENKBCDEBUG */
+#ifdef XENKBCDEBUG
+#define        DPRINTF(x) printf x
+#else
+#define        DPRINTF(x)
+#endif
+
+
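+/*
+ * Fetch one status/data pair from the hypervisor; the scancode is
+ * stashed in xi_data and the i8042-style status byte is returned.
+ */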
+static int
+xenkbc_getstatus(struct xenkbc_internal *xi)
+{
+       long res;
+
+       res = HYPERVISOR_kbd_op(KBD_OP_READ, 0);
+       if (res < 0) {
+               xi->xi_data = 0;
+               return 0;
+       }
+       xi->xi_data = KBD_CODE_SCANCODE(res);
+       return KBD_CODE_STATUS(res);
+}
+
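+/* Busy-wait until the controller's input buffer empties, or time out. */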
+static int
+xenkbc_wait_output(struct xenkbc_internal *xi)
+{
+       u_int i;
+
+       for (i = KBC_TIMEOUT; i; i--) {
+               if ((xenkbc_getstatus(xi) & KBS_IBF) == 0)
+                       return (1);
+               KBC_DELAY;
+       }
+       return (0);
+}
+
+static int
+xenkbc_match(struct device *parent, struct cfdata *cf, void *aux)
+{
+       struct xenkbc_attach_args *xa = aux;
+
+       if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
+               return 0;
+
+       if (strcmp(xa->xa_device, "xenkbc"))
+               return 0;
+
+       return 1;
+}
+
+static int
+xenkbc_attach_slot(struct xenkbc_softc *xs, pckbport_slot_t slot)
+{
+       struct xenkbc_internal *xi = xs->sc_xi;
+       struct device *child;
+       int alloced = 0;
+
+       if (xi->xi_slotdata[slot] == NULL) {
+               xi->xi_slotdata[slot] = malloc(sizeof(struct xenkbc_slotdata),
+                   M_DEVBUF, M_NOWAIT);
+               if (xi->xi_slotdata[slot] == NULL) {
+                       printf("%s: no memory\n", xs->sc_dev.dv_xname);
+                       return 0;
+               }
+               xenkbc_init_slotdata(xi->xi_slotdata[slot]);
+               alloced++;
+       }
+
+       child = pckbport_attach_slot(&xs->sc_dev, xi->xi_pt, slot);
+
+       if (child == NULL && alloced) {
+               free(xi->xi_slotdata[slot], M_DEVBUF);
+               xi->xi_slotdata[slot] = NULL;
+       }
+
+#if NRND > 0
+       if (child != NULL && xi->xi_slotdata[slot] != NULL)
+               rnd_attach_source(&xi->xi_slotdata[slot]->xsd_rnd_source,
+                   child->dv_xname, RND_TYPE_TTY, 0);
+#endif
+
+       return child != NULL;
+}
+
+static void
+xenkbc_attach(struct device *parent, struct device *self, void *aux)
+{
+       /*  struct xenkbc_attach_args *xa = aux; */
+       struct xenkbc_softc *xs = (struct xenkbc_softc *)self;
+       struct xenkbc_internal *xi;
+       int res;
+       u_char cmdbits = 0;
+
+       if (XI_CONSOLE(&xenkbc_consdata))
+               xi = &xenkbc_consdata;
+       else {
+               xi = malloc(sizeof(struct xenkbc_internal), M_DEVBUF,
+                   M_NOWAIT | M_ZERO);
+               if (xi == NULL) {
+                       aprint_error(": no memory\n");
+                       return;
+               }
+               xi->xi_8042cmdbyte = KC8_CPU;
+       }
+
+       aprint_normal(": Xen Keyboard/Mouse Device\n");
+
+       xs->sc_xi = xi;
+       xi->xi_sc = xs;
+
+       event_set_handler(_EVENT_PS2, &xenkbc_intr, xi, IPL_TTY);
+       hypervisor_enable_event(_EVENT_PS2);
+
+       xi->xi_pt = pckbport_attach(xi, &xenkbc_ops);
+
+       /* flush */
+       xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+
+       /* set initial cmd byte */
+       if (!xenkbc_put8042cmd(xi)) {
+               printf("kbc: cmd word write error\n");
+               return;
+       }
+
+       if (xenkbc_attach_slot(xs, PCKBPORT_KBD_SLOT))
+               cmdbits |= KC8_KENABLE;
+
+       /*
+        * Check that the aux port is ok.
+        */
+       if (!xenkbc_send_cmd(xi, KBC_AUXECHO)) {
+               printf("kbc: aux echo error 1\n");
+               goto nomouse;
+       }
+       if (!xenkbc_wait_output(xi)) {
+               printf("kbc: aux echo error 2\n");
+               goto nomouse;
+       }
+       XI_SETHASAUX(xi, 1);
+       xenkbc_send_data(xi, 0x5a); /* a random value */
+       res = xenkbc_poll_data1(xi, PCKBPORT_AUX_SLOT);
+       if (res != -1) {
+               /*
+                * In most cases, the 0x5a gets echoed.
+                * Some older controllers (Gateway 2000 circa 1993)
+                * return 0xfe here.
+                * We are satisfied if there is anything in the
+                * aux output buffer.
+                */
+               if (xenkbc_attach_slot(xs, PCKBPORT_AUX_SLOT))
+                       cmdbits |= KC8_MENABLE;
+       } else {
+#ifdef XENKBCDEBUG
+               printf("kbc: aux echo test failed\n");
+#endif
+               XI_SETHASAUX(xi, 0);
+       }
+
+ nomouse:
+       /* enable needed interrupts */
+       xi->xi_8042cmdbyte |= cmdbits;
+       if (!xenkbc_put8042cmd(xi))
+               printf("kbc: cmd word write error\n");
+}
+
+static void
+xenkbc_init_slotdata(struct xenkbc_slotdata *xsd)
+{
+
+       xsd->xsd_polling = 0;
+}
+
+/*
+ * Get the current command byte.
+ */
+static int
+xenkbc_get8042cmd(struct xenkbc_internal *xi)
+{
+       int data;
+
+       if (!xenkbc_send_cmd(xi, K_RDCMDBYTE))
+               return 0;
+       data = xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+       if (data == -1)
+               return 0;
+       xi->xi_8042cmdbyte = data;
+       return 1;
+}
+
+/*
+ * Pass command byte to keyboard controller (8042).
+ */
+static int
+xenkbc_put8042cmd(struct xenkbc_internal *xi)
+{
+
+       if (!xenkbc_send_cmd(xi, K_LDCMDBYTE))
+               return 0;
+       if (!xenkbc_wait_output(xi))
+               return 0;
+       return xenkbc_send_data(xi, xi->xi_8042cmdbyte);
+}
+
+static int
+xenkbc_send_devcmd(void *cookie, pckbport_slot_t slot, u_char devcmd)
+{
+
+       DPRINTF(("send_devcmd %x\n", devcmd));
+
+       if (slot == PCKBPORT_AUX_SLOT) {
+               if (!xenkbc_send_cmd(cookie, KBC_AUXWRITE)) {
+                       DPRINTF(("xenkbc_send_devcmd: KBC_AUXWRITE failed\n"));
+                       return 0;
+               }
+       }
+       if (!xenkbc_wait_output(cookie)) {
+               DPRINTF(("xenkbc_send_devcmd: wait_output failed\n"));
+               return 0;
+       }
+       return xenkbc_send_data(cookie, devcmd);
+}
+
+static int
+xenkbc_send_cmd(void *cookie, u_char cmd)
+{
+       struct xenkbc_internal *xi = cookie;
+
+       DPRINTF(("send_cmd %x\n", cmd));
+       xenkbc_wait_output(xi);
+       return !HYPERVISOR_kbd_op(KBD_OP_WRITECOMMAND, cmd);
+}
+
+static int
+xenkbc_send_data(void *cookie, u_char output)
+{
+       struct xenkbc_internal *xi = cookie;
+
+       DPRINTF(("send_data %x\n", output));
+       xenkbc_wait_output(xi);
+       return !HYPERVISOR_kbd_op(KBD_OP_WRITEOUTPUT, output);
+}
+
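+/*
+ * Poll for one byte destined for the given slot.  Data parked by the
+ * interrupt handler while polling is consumed first; bytes belonging
+ * to the other slot are dropped.
+ */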
+static int
+xenkbc_poll_data1(void *cookie, pckbport_slot_t slot)
+{
+       struct xenkbc_internal *xi = cookie;
+       struct xenkbc_slotdata *xsd = xi->xi_slotdata[slot];
+       int s;
+       u_char stat, c;
+       int i = 1000;
+
+       s = splhigh();
+
+       if (xsd && xsd->xsd_polling && xsd->xsd_poll_data != -1 &&
+           xsd->xsd_poll_stat != -1) {
+               stat = xsd->xsd_poll_stat;
+               c = xsd->xsd_poll_data;
+               xsd->xsd_poll_data = -1;
+               xsd->xsd_poll_stat = -1;
+               goto process;
+       }
+
+       DELAY(10);
+       for (; i; i--) {
+               stat = xenkbc_getstatus(xi);
+               if (stat & KBS_DIB) {
+                       c = xi->xi_data;
+                       DELAY(10);
+               process:
+                       if (XI_HASAUX(xi) && (stat & 0x20)) { /* aux data */
+                               if (slot != PCKBPORT_AUX_SLOT) {
+#ifdef XENKBCDEBUG
+                                       printf("lost aux 0x%x\n", c);
+#endif
+                                       continue;
+                               }
+                       } else {
+                               if (slot == PCKBPORT_AUX_SLOT) {
+#ifdef XENKBCDEBUG
+                                       printf("lost kbd 0x%x\n", c);
+#endif
+                                       continue;
+                               }
+                       }
+                       splx(s);
+                       DPRINTF(("poll -> %x stat %x\n", c, stat));
+                       return c;
+               }
+       }
+
+       DPRINTF(("poll failed -> -1\n"));
+       splx(s);
+       return -1;
+}
+
+/*
+ * switch scancode translation on / off
+ * return nonzero on success
+ */
+static int
+xenkbc_xt_translation(void *cookie, pckbport_slot_t slot, int on)
+{
+       struct xenkbc_internal *xi = cookie;
+       int ison;
+
+       if (slot != PCKBPORT_KBD_SLOT) {
+               /* translation only for kbd slot */
+               if (on)
+                       return 0;
+               else
+                       return 1;
+       }
+
+       ison = xi->xi_8042cmdbyte & KC8_TRANS;
+       if ((on && ison) || (!on && !ison))
+               return 1;
+
+       xi->xi_8042cmdbyte ^= KC8_TRANS;
+       if (!xenkbc_put8042cmd(xi))
+               return 0;
+
+       /* read back to be sure */
+       if (!xenkbc_get8042cmd(xi))
+               return 0;
+
+       ison = xi->xi_8042cmdbyte & KC8_TRANS;
+       if ((on && ison) || (!on && !ison))
+               return 1;
+       return 0;
+}
+
+static const struct xenkbc_portcmd {
+       u_char cmd_en, cmd_dis;
+} xenkbc_portcmd[2] = {
+       {
+               KBC_KBDENABLE, KBC_KBDDISABLE,
+       }, {
+               KBC_AUXENABLE, KBC_AUXDISABLE,
+       }
+};
+
+static void
+xenkbc_slot_enable(void *cookie, pckbport_slot_t slot, int on)
+{
+       struct xenkbc_internal *xi = cookie;
+       const struct xenkbc_portcmd *cmd;
+
+       cmd = &xenkbc_portcmd[slot];
+
+       DPRINTF(("slot enable %d -> %d\n", slot, on));
+       xenkbc_send_cmd(xi, on ? cmd->cmd_en : cmd->cmd_dis);
+}
+
+
+static void
+xenkbc_intr_establish(void *cookie, pckbport_slot_t slot)
+{
+
+}
+
+static void
+xenkbc_set_poll(void *cookie, pckbport_slot_t slot, int on)
+{
+       struct xenkbc_internal *xi = cookie;
+
+       DPRINTF(("xenkbc_set_poll %d -> %d\n", slot, on));
+
+       xi->xi_slotdata[slot]->xsd_polling = on;
+
+       if (on) {
+               xi->xi_slotdata[slot]->xsd_poll_data = -1;
+               xi->xi_slotdata[slot]->xsd_poll_stat = -1;
+       } else {
+               int s;
+
+               /*
+                * If disabling polling on a device that's been configured,
+                * make sure there are no bytes left in the FIFO, holding up
+                * the interrupt line.  Otherwise we won't get any further
+                * interrupts.
+                */
+               s = spltty();
+               xenkbc_intr(xi);
+               splx(s);
+       }
+}
+
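+/*
+ * Interrupt handler: drain pending bytes, routing each to the kbd or
+ * aux slot.  While a slot is in polling mode its byte is parked for
+ * xenkbc_poll_data1() instead of being passed to pckbport.
+ */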
+static int
+xenkbc_intr(void *self)
+{
+       struct xenkbc_internal *xi = self;
+       u_char stat;
+       pckbport_slot_t slot;
+       struct xenkbc_slotdata *xsd;
+       int served = 0;
+
+       for (;;) {
+               stat = xenkbc_getstatus(xi);
+               if (!(stat & KBS_DIB))
+                       break;
+
+               served = 1;
+
+               slot = (XI_HASAUX(xi) && (stat & 0x20)) ?
+                       PCKBPORT_AUX_SLOT : PCKBPORT_KBD_SLOT;
+               xsd = xi->xi_slotdata[slot];
+
+               if (xsd == NULL)
+                       continue;
+
+#if NRND > 0
+               rnd_add_uint32(&xsd->xsd_rnd_source,
+                   (stat << 8) | xi->xi_data);
+#endif
+
+               if (xsd->xsd_polling) {
+                       xsd->xsd_poll_data = xi->xi_data;
+                       xsd->xsd_poll_stat = stat;
+                       break; /* xenkbc_poll_data() will get it */
+               }
+
+               pckbportintr(xi->xi_pt, slot, xi->xi_data);
+       }
+
+       return served;
+}
+
+int
+xenkbc_cnattach(pckbport_slot_t slot)
+{
+       struct xenkbc_internal *xi = &xenkbc_consdata;
+       int ret;
+
+       /* flush */
+       (void) xenkbc_poll_data1(xi, PCKBPORT_KBD_SLOT);
+
+       /* init cmd byte, enable ports */
+       xenkbc_consdata.xi_8042cmdbyte = KC8_CPU;
+       if (!xenkbc_put8042cmd(xi)) {
+               printf("kbc: cmd word write error\n");
+               return EIO;
+       }
+
+       ret = pckbport_cnattach(xi, &xenkbc_ops, slot);
+
+       xi->xi_slotdata[slot] = &xenkbc_cons_slotdata;
+       xenkbc_init_slotdata(xi->xi_slotdata[slot]);
+       XI_SETCONSOLE(xi, 1);
+
+       return ret;
+}
diff --git a/netbsd-2.0-xen-sparse/sys/nfs/files.nfs b/netbsd-2.0-xen-sparse/sys/nfs/files.nfs
new file mode 100644 (file)
index 0000000..228c0c8
--- /dev/null
@@ -0,0 +1,34 @@
+#      $NetBSD: files.nfs,v 1.3 2004/03/11 21:48:43 cl Exp $
+
+deffs  fs_nfs.h                NFS
+
+defflag opt_nfs_boot.h         NFS_BOOT_BOOTP NFS_BOOT_BOOTPARAM NFS_BOOT_DHCP
+                               NFS_BOOT_GATEWAY NFS_BOOT_TCP
+                               NFS_BOOT_BOOTSTATIC
+
+defparam opt_nfs_boot.h                NFS_BOOT_BOOTP_REQFILE NFS_BOOT_OPTIONS
+                               NFS_BOOT_RWSIZE
+                               NFS_BOOTSTATIC_MYIP NFS_BOOTSTATIC_GWIP
+                               NFS_BOOTSTATIC_MASK NFS_BOOTSTATIC_SERVADDR
+                               NFS_BOOTSTATIC_SERVER
+
+defflag opt_nfs.h              NFS_V2_ONLY
+
+defflag                                NFSSERVER
+
+file   nfs/krpc_subr.c         nfs 
+file   nfs/nfs_bio.c           nfs 
+file   nfs/nfs_boot.c          nfs 
+file   nfs/nfs_bootdhcp.c      nfs & (nfs_boot_bootp | nfs_boot_dhcp)
+file   nfs/nfs_bootparam.c     nfs & nfs_boot_bootparam
+file   nfs/nfs_bootstatic.c    nfs & nfs_boot_bootstatic
+file   nfs/nfs_kq.c            nfs   
+file   nfs/nfs_node.c          nfs   
+file   nfs/nfs_nqlease.c       nfsserver | nfs
+file   nfs/nfs_serv.c          nfsserver
+file   nfs/nfs_socket.c        nfsserver | nfs
+file   nfs/nfs_srvcache.c      nfsserver
+file   nfs/nfs_subs.c          nfsserver | nfs
+file   nfs/nfs_syscalls.c      nfsserver | nfs
+file   nfs/nfs_vfsops.c        nfs  
+file   nfs/nfs_vnops.c         nfs